# Source code for biomedical_data_generator.meta
# Copyright (c) 2025 Sigrun May,
# Ostfalia Hochschule für angewandte Wissenschaften
#
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT
"""Metadata about the generated dataset."""
from dataclasses import asdict, dataclass, field
import numpy as np
from biomedical_data_generator.config import AnchorMode
@dataclass
class BatchMeta:
"""Metadata about batch effects."""
batch_ids: np.ndarray # (n_samples,)
batch_offsets: np.ndarray # (n_batches,)
batches_majority_class: np.ndarray | None # (n_batches,) or None
scope: str
sd: float
# =========================
# Ground-truth meta
# =========================
@dataclass(frozen=True)
class DatasetMeta:
    """Metadata about the generated dataset.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction. Fields without a default must be supplied by the caller;
    the trailing fields are optional.
    """

    feature_names: list[str]

    informative_idx: list[int]  # includes cluster anchors + free i*
    pseudo_idx: list[int]  # corr* proxies + free p*
    noise_idx: list[int]

    # Correlated cluster structure
    corr_cluster_indices: dict[int, list[int]]  # cluster_id -> column indices
    anchor_idx: dict[int, int | None]  # cluster_id -> anchor col (or None)
    anchor_role: dict[int, str]  # "informative" | "pseudo" | "noise"
    anchor_effect_size: dict[int, float]  # effect size (beta) for the anchor
    anchor_target_cls: dict[int, int | None]  # target class for the anchor (one-vs-rest)
    cluster_label: dict[int, str | None]  # didactic tags per cluster

    # Class distribution
    y_weights: tuple[float, ...]
    y_counts: dict[int, int]

    # Provenance / signal settings
    n_classes: int
    class_sep: float
    corr_between: float

    # --- optional (with defaults) ---
    anchor_strength: float = 1.0
    anchor_mode: AnchorMode = "equalized"
    spread_non_anchors: bool = True
    random_state: int | None = None
    # default_factory gives every instance its own empty dict.
    resolved_config: dict[str, object] = field(default_factory=dict)

    def to_dict(self) -> dict[str, object]:
        """Convert to a dictionary (e.g., for JSON serialization).

        Returns:
            A new dict keyed by field name, in field declaration order,
            built with ``dataclasses.asdict`` (nested lists, tuples and
            dicts are copied recursively rather than shared).
        """
        return asdict(self)