Skip to content

Generate

Generate some fake data for various purposes.

LatentCalendarSampler

Sampler for generating synthetic calendar data from a fitted LatentCalendar model.

Parameters:

Name Type Description Default
model

a fitted LatentCalendar model

required
random_state int | None

seed for reproducibility

None
concentration_scale float

scale for Gamma-perturbing each user's Dirichlet concentration before sampling mixture weights. 1.0 (default) means no perturbation. Values > 1.0 increase variance across users.

1.0
Example

model = LatentCalendar(n_components=5).fit(X)
sampler = model.create_sampler(random_state=42)
df_weights, df_events = sampler.sample(n_samples=[10, 5, 20])

Source code in latent_calendar/generate.py
class LatentCalendarSampler:
    """Sampler for generating synthetic calendar data from a fitted LatentCalendar model.

    Args:
        model: a fitted LatentCalendar model
        random_state: seed for reproducibility
        concentration_scale: scale for Gamma-perturbing each user's Dirichlet
            concentration before sampling mixture weights. 1.0 (default) means
            no perturbation. Values > 1.0 increase variance across users.

    Example:
        >>> model = LatentCalendar(n_components=5).fit(X)
        >>> sampler = model.create_sampler(random_state=42)
        >>> df_weights, df_events = sampler.sample(n_samples=[10, 5, 20])

    """

    def __init__(
        self,
        model,
        random_state: int | None = None,
        concentration_scale: float = 1.0,
    ) -> None:
        self.model = model
        self.random_state = random_state
        self.concentration_scale = concentration_scale
        # Created once here and handed to every sample() call.
        self._rng = np.random.default_rng(random_state)

    def sample(
        self,
        n_samples: Union[int, list[int], np.ndarray],
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Sample synthetic calendar events from the fitted model.

        Component mixture weights for each user are drawn from the population-level
        Dirichlet prior derived from the fitted model's component distribution.

        Args:
            n_samples: number of events per user. A single integer (built-in
                int, NumPy integer scalar, or 0-d array) produces one user with
                that many events. A list/array produces one user per element.

        Returns:
            Tuple of:
                - df_weights: mixture weight DataFrame (n_users, n_components)
                - df_events: event count DataFrame (n_users, n_time_slots)

        """
        # atleast_1d promotes any scalar to a one-element array. Checking only
        # for built-in int would leave NumPy integer scalars as 0-d arrays, on
        # which len() raises TypeError.
        events_per_user = np.atleast_1d(np.asarray(n_samples, dtype=int))
        n_users = len(events_per_user)

        # Broadcast population-level concentration to (n_users, n_components).
        # broadcast_to returns a read-only view, so copy to get a writable array.
        component_concentration = self.model.component_distribution_
        component_weights = np.broadcast_to(
            component_concentration, (n_users, len(component_concentration))
        ).copy()

        mixture_weights, event_counts = _sample_calendar(
            component_weights=component_weights,
            normalized_components=self.model.normalized_components_,
            n_samples=events_per_user,
            rng=self._rng,
            concentration_scale=self.concentration_scale,
        )

        df_weights = pd.DataFrame(
            mixture_weights,
            columns=range(self.model.n_components),
        )
        # Use the model's own feature names when it has them; otherwise fall
        # back to the full vocabulary.
        columns = getattr(self.model, "feature_names_in_", FULL_VOCAB)
        df_events = pd.DataFrame(event_counts, columns=columns)

        return df_weights, df_events

    def sample_events(self, n: int) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Sample events for a single user.

        Args:
            n: number of events to draw

        Returns:
            Tuple of:
                - df_weights: mixture weight DataFrame (1, n_components)
                - df_events: event count DataFrame (1, n_time_slots)

        """
        return self.sample(n_samples=n)

sample(n_samples)

Sample synthetic calendar events from the fitted model.

Component mixture weights for each user are drawn from the population-level Dirichlet prior derived from the fitted model's component distribution.

Parameters:

Name Type Description Default
n_samples Union[int, list[int], ndarray]

number of events per user. A single int produces one user with that many events. A list/array produces one user per element.

required

Returns:

Type Description
tuple[DataFrame, DataFrame]

Tuple of: - df_weights: mixture weight DataFrame (n_users, n_components) - df_events: event count DataFrame (n_users, n_time_slots)

Source code in latent_calendar/generate.py
def sample(
    self,
    n_samples: Union[int, list[int], np.ndarray],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Sample synthetic calendar events from the fitted model.

    Component mixture weights for each user are drawn from the population-level
    Dirichlet prior derived from the fitted model's component distribution.

    Args:
        n_samples: number of events per user. A single integer (built-in int,
            NumPy integer scalar, or 0-d array) produces one user with that
            many events. A list/array produces one user per element.

    Returns:
        Tuple of:
            - df_weights: mixture weight DataFrame (n_users, n_components)
            - df_events: event count DataFrame (n_users, n_time_slots)

    """
    # atleast_1d promotes any scalar to a one-element array. Checking only for
    # built-in int would leave NumPy integer scalars as 0-d arrays, on which
    # len() raises TypeError.
    events_per_user = np.atleast_1d(np.asarray(n_samples, dtype=int))
    n_users = len(events_per_user)

    # Broadcast population-level concentration to (n_users, n_components).
    # broadcast_to returns a read-only view, so copy to get a writable array.
    component_concentration = self.model.component_distribution_
    component_weights = np.broadcast_to(
        component_concentration, (n_users, len(component_concentration))
    ).copy()

    mixture_weights, event_counts = _sample_calendar(
        component_weights=component_weights,
        normalized_components=self.model.normalized_components_,
        n_samples=events_per_user,
        rng=self._rng,
        concentration_scale=self.concentration_scale,
    )

    df_weights = pd.DataFrame(
        mixture_weights,
        columns=range(self.model.n_components),
    )
    # Use the model's own feature names when it has them; otherwise fall back
    # to the full vocabulary.
    columns = getattr(self.model, "feature_names_in_", FULL_VOCAB)
    df_events = pd.DataFrame(event_counts, columns=columns)

    return df_weights, df_events

sample_events(n)

Sample events for a single user.

Parameters:

Name Type Description Default
n int

number of events to draw

required

Returns:

Type Description
tuple[DataFrame, DataFrame]

Tuple of: - df_weights: mixture weight DataFrame (1, n_components) - df_events: event count DataFrame (1, n_time_slots)

Source code in latent_calendar/generate.py
def sample_events(self, n: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Draw ``n`` events for one user by delegating to ``sample``.

    Args:
        n: number of events to draw

    Returns:
        Whatever ``self.sample(n_samples=n)`` returns; for a single count
        that is a tuple of:
            - df_weights: mixture weight DataFrame (1, n_components)
            - df_events: event count DataFrame (1, n_time_slots)

    """
    single_user_result = self.sample(n_samples=n)
    return single_user_result

sample_from_latent_calendar(model, n_samples, random_state=None, concentration_scale=1.0)

Sample synthetic calendar data from a fitted LatentCalendar model.

Convenience wrapper around the `LatentCalendarSampler` class.

Parameters:

Name Type Description Default
model

fitted LatentCalendar model

required
n_samples Union[int, list[int], ndarray]

number of events per user. A single int produces one user with that many events. A list/array produces one user per element.

required
random_state int | None

seed for reproducibility

None
concentration_scale float

scale for Gamma-perturbing each user's Dirichlet concentration before sampling mixture weights. 1.0 (default) means no perturbation. Values > 1.0 increase variance across users.

1.0

Returns:

Type Description
tuple[DataFrame, DataFrame]

Tuple of: - df_weights: mixture weight DataFrame (n_users, n_components) - df_events: event count DataFrame (n_users, n_time_slots)

Source code in latent_calendar/generate.py
def sample_from_latent_calendar(
    model,
    n_samples: Union[int, list[int], np.ndarray],
    random_state: int | None = None,
    concentration_scale: float = 1.0,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Draw synthetic calendar data from a fitted LatentCalendar model in one call.

    Builds a throwaway :class:`LatentCalendarSampler` with the given settings
    and returns the result of its ``sample`` method.

    Args:
        model: fitted LatentCalendar model
        n_samples: number of events per user. A single int produces one user
            with that many events. A list/array produces one user per element.
        random_state: seed for reproducibility
        concentration_scale: scale for Gamma-perturbing each user's Dirichlet
            concentration before sampling mixture weights. 1.0 (default) means
            no perturbation. Values > 1.0 increase variance across users.

    Returns:
        Tuple of:
            - df_weights: mixture weight DataFrame (n_users, n_components)
            - df_events: event count DataFrame (n_users, n_time_slots)

    """
    sampler = LatentCalendarSampler(
        model,
        random_state=random_state,
        concentration_scale=concentration_scale,
    )
    return sampler.sample(n_samples)

sample_from_lda(components_prior, components_time_slots_prior, n_samples, random_state=None)

Deprecated. Use `sample_from_latent_calendar` instead.

Deprecated: `sample_from_lda` has been removed. Build a `latent_calendar.model.latent_calendar.DummyModel` from a prior and use `sample_from_latent_calendar` instead.

Source code in latent_calendar/generate.py
def sample_from_lda(
    components_prior,
    components_time_slots_prior,
    n_samples,
    random_state: int | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Deprecated. Use :func:`sample_from_latent_calendar` instead.

    .. deprecated::
        ``sample_from_lda`` has been removed. Build a :class:`~latent_calendar.model.latent_calendar.DummyModel`
        from a prior and use :func:`sample_from_latent_calendar` instead.

    """
    import warnings

    warnings.warn(
        "sample_from_lda is deprecated and will be removed in a future release. "
        "Use sample_from_latent_calendar instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    raise NotImplementedError(
        "sample_from_lda has been removed. Use sample_from_latent_calendar instead."
    )

wide_format_dataframe(n_rows, rate=1.0, random_state=None)

Generate some data from a Poisson distribution.

Parameters:

Name Type Description Default
n_rows int

number of rows to generate

required
rate float

rate parameter for Poisson distribution

1.0
random_state int | None

random state for reproducibility

None

Returns:

Type Description
DataFrame

DataFrame with columns from FULL_VOCAB and n_rows rows

Source code in latent_calendar/generate.py
def wide_format_dataframe(
    n_rows: int,
    rate: float = 1.0,
    random_state: int | None = None,
) -> pd.DataFrame:
    """Generate a wide-format DataFrame of Poisson-distributed counts.

    Args:
        n_rows: number of rows to generate
        rate: rate parameter for Poisson distribution
        random_state: seed for reproducibility. When given, values are drawn
            from a private generator seeded with it, so NumPy's global random
            state is not reseeded. When None, NumPy's global random state is
            used.

    Returns:
        DataFrame with columns from FULL_VOCAB and n_rows rows

    """
    # A private legacy RandomState draws the same values that seeding the
    # global generator would, without making every later np.random call in
    # the process deterministic as a side effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    data = rng.poisson(lam=rate, size=(n_rows, len(FULL_VOCAB)))

    return pd.DataFrame(data, columns=FULL_VOCAB)

Comments