# Source code for biomedical_data_generator.meta

# Copyright (c) 2025 Sigrun May,
# Ostfalia Hochschule für angewandte Wissenschaften
#
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Metadata about the generated dataset."""
from __future__ import annotations

from dataclasses import asdict, dataclass, field

import numpy as np


# =========================
# Batch effects meta
# =========================
@dataclass
class BatchMeta:
    """Metadata about applied batch effects.

    Attributes:
        batch_assignments:
            Array of shape (n_samples,) with per-sample batch IDs.

        batch_intercepts:
            Mapping from batch_id to array of intercepts per affected feature.
            Structure: {batch_id: np.ndarray of shape (n_affected_features,)}.
            For example, with 3 batches and 5 affected features:
            {0: array([0.5, -0.3, 0.8, ...]), 1: array([...]), 2: array([...])}

        effect_type:
            Type of batch effect ("additive" or "multiplicative").

        effect_strength:
            Standard deviation of batch intercepts (controls magnitude).

        confounding_with_class:
            Degree of correlation between batch and class (0.0–1.0).
            0.0 = independent, 1.0 = perfect confounding.

        proportions:
            Proportions of samples per batch (if specified).

        affected_feature_indices:
            List of feature indices affected by batch effects
            (None if all features are affected).
    """

    batch_assignments: np.ndarray  # (n_samples,)
    batch_intercepts: dict[int, np.ndarray]  # batch_id -> intercepts per affected feature
    effect_type: str  # "additive" or "multiplicative"
    effect_strength: float
    confounding_with_class: float
    proportions: tuple[float, ...] | None = None
    affected_feature_indices: list[int] | None = None


# =========================
# Ground-truth dataset meta
# =========================
@dataclass(frozen=True)
class DatasetMeta:
    """Metadata about the generated dataset.

    This captures the resolved ground-truth structure of the dataset
    (feature roles, cluster layout, anchor properties) plus a snapshot
    of the generator configuration.
    """

    # ---------------- core feature layout ----------------

    # Human-readable column names (same order as in X)
    feature_names: list[str]

    # Index sets (0-based column indices)
    informative_idx: list[int]  # includes cluster anchors + free informative features
    noise_idx: list[int]  # independent / free noise features (no anchors)

    # Correlated clusters
    corr_cluster_indices: dict[int, list[int]]  # cluster_id -> list of column indices
    anchor_idx: dict[int, int | None]  # cluster_id -> anchor col (or None)

    # Per-cluster properties (mirroring CorrClusterConfig)
    anchor_role: dict[int, str]  # "informative" | "noise"
    anchor_effect_size: dict[int, float]  # numeric effect size used for the anchor
    anchor_class: dict[int, int | None]  # class index the anchor predicts (one-vs-rest)
    cluster_label: dict[int, str | None]  # descriptive label per cluster (didactic tag)

    # ---------------- provenance / global settings ----------------

    n_classes: int
    class_names: list[str]
    samples_per_class: dict[int, int]
    class_sep: list[float]  # resolved class separation per boundary
    corr_between: float  # correlation between different clusters/roles

    # ---------------- batch effects (optional) ----------------

    batch_labels: np.ndarray | None = None  # shape (n_samples,)
    batch_effects: np.ndarray | None = None
    # If present, raw effects as returned by apply_batch_effects:
    # - scalar granularity: shape (n_batches,)
    # - per-feature summary: shape (n_batches, n_affected_features)
    batch_config: dict[str, object] | None = None  # serialized BatchEffectsConfig

    # ---------------- generator config snapshot ----------------

    random_state: int | None = None
    resolved_config: dict[str, object] = field(default_factory=dict)


[docs]
    def to_dict(self) -> dict[str, object]:
        """Convert to a plain dictionary (e.g., for JSON serialization)."""
        return asdict(self)