# Source code for biomedical_data_generator.meta

# Copyright (c) 2025 Sigrun May,
# Ostfalia Hochschule für angewandte Wissenschaften
#
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Metadata about the generated dataset."""
from __future__ import annotations

from dataclasses import asdict, dataclass, field

import numpy as np


# =========================
# Batch effects meta
# =========================
@dataclass
class BatchMeta:
    """Metadata about applied batch effects.

    Attributes:
        batch_assignments:
            Array of shape (n_samples,) with per-sample batch IDs.

        batch_intercepts:
            Mapping from batch_id to array of intercepts per affected feature.
            Structure: {batch_id: np.ndarray of shape (n_affected_features,)}.
            For example, with 3 batches and 5 affected features:
            {0: array([0.5, -0.3, 0.8, ...]), 1: array([...]), 2: array([...])}

        effect_type:
            Type of batch effect ("additive" or "multiplicative").

        effect_strength:
            Standard deviation of batch intercepts (controls magnitude).

        confounding_with_class:
            Degree of correlation between batch and class (0.0–1.0).
            0.0 = independent, 1.0 = perfect confounding.

        proportions:
            Proportions of samples per batch (if specified).

        affected_feature_indices:
            List of feature indices affected by batch effects
            (None if all features are affected).
    """

    batch_assignments: np.ndarray  # (n_samples,)
    batch_intercepts: dict[int, np.ndarray]  # batch_id -> intercepts per affected feature
    effect_type: str  # "additive" or "multiplicative"
    effect_strength: float
    confounding_with_class: float
    proportions: tuple[float, ...] | None = None
    affected_feature_indices: list[int] | None = None


# =========================
# Ground-truth dataset meta
# =========================
[docs] @dataclass(frozen=True) class DatasetMeta: """Metadata about the generated dataset. This captures the resolved ground-truth structure of the dataset (feature roles, cluster layout, anchor properties) plus a snapshot of the generator configuration. """ # ---------------- core feature layout ---------------- # Human-readable column names (same order as in X) feature_names: list[str] # Index sets (0-based column indices) informative_idx: list[int] # includes cluster anchors + free informative features noise_idx: list[int] # independent / free noise features (no anchors) # Correlated clusters corr_cluster_indices: dict[int, list[int]] # cluster_id -> list of column indices anchor_idx: dict[int, int | None] # cluster_id -> anchor col (or None) # Per-cluster properties (mirroring CorrClusterConfig) anchor_role: dict[int, str] # "informative" | "noise" anchor_effect_size: dict[int, float] # numeric effect size used for the anchor anchor_class: dict[int, int | None] # class index the anchor predicts (one-vs-rest) cluster_label: dict[int, str | None] # descriptive label per cluster (didactic tag) # ---------------- provenance / global settings ---------------- n_classes: int class_names: list[str] samples_per_class: dict[int, int] class_sep: list[float] # resolved class separation per boundary corr_between: float # correlation between different clusters/roles # ---------------- batch effects (optional) ---------------- batch_labels: np.ndarray | None = None # shape (n_samples,) batch_effects: np.ndarray | None = None # If present, raw effects as returned by apply_batch_effects: # - scalar granularity: shape (n_batches,) # - per-feature summary: shape (n_batches, n_affected_features) batch_config: dict[str, object] | None = None # serialized BatchEffectsConfig # ---------------- generator config snapshot ---------------- random_state: int | None = None resolved_config: dict[str, object] = field(default_factory=dict)
[docs] def to_dict(self) -> dict[str, object]: """Convert to a plain dictionary (e.g., for JSON serialization).""" return asdict(self)