Source code for biomedical_data_generator.utils.export_utils

# biomedical_data_generator/utils/export_utils.py

"""Export utilities for saving generated datasets to various formats."""

from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd
from numpy.typing import NDArray

from biomedical_data_generator.meta import DatasetMeta

__all__ = [
    "to_labeled_dataframe",
    "to_csv",
    "to_parquet",
]


def to_labeled_dataframe(
    x: pd.DataFrame | NDArray[np.float64],
    y: NDArray[np.int64] | None = None,
    meta: DatasetMeta | None = None,
    *,
    include_labels: bool = True,
    label_col_name: str = "y",
    label_str_col_name: str = "y_label",
    feature_names: list[str] | None = None,
) -> pd.DataFrame:
    """Convert a generated dataset to a DataFrame with optional label columns.

    Supported use cases:

    1. Full conversion: ``x`` + ``y`` + ``meta`` -> features plus labels.
    2. Features only: ``x`` + ``meta`` with ``include_labels=False``.
    3. Custom names: override the default column names.

    Args:
        x: Feature matrix (DataFrame or ndarray). A DataFrame input is copied,
            never modified in place.
        y: Class labels used as indices into ``meta.class_labels``. Required
            when ``include_labels`` is True.
        meta: Optional dataset metadata. Supplies ``feature_names`` when
            ``feature_names`` is not given and, when it has a non-None
            ``class_labels`` attribute, the string labels.
        include_labels: If True, insert the numeric labels as the first column
            and, when class labels are available, append the string labels as
            the last column.
        label_col_name: Column name for numeric labels.
        label_str_col_name: Column name for string labels.
        feature_names: Column names for the features; takes precedence over
            ``meta.feature_names``.

    Returns:
        A new DataFrame with the requested columns.

    Raises:
        ValueError: If neither ``meta`` nor ``feature_names`` is provided, if
            ``include_labels`` is True but ``y`` is None, or if ``x`` and ``y``
            have a different number of samples.

    Examples:
        >>> # Standard usage
        >>> df = to_labeled_dataframe(x, y, meta)
        >>> # Features only
        >>> df_features = to_labeled_dataframe(x, meta=meta, include_labels=False)
        >>> # Custom column names
        >>> df = to_labeled_dataframe(x, y, meta,
        ...                           label_col_name="class",
        ...                           label_str_col_name="diagnosis")
    """
    # Determine feature names.
    if feature_names is None:
        if meta is None:
            raise ValueError("Either meta or feature_names must be provided")
        feature_names = meta.feature_names
    # Normalise to a list so the equality check below also works for tuples
    # or other sequences.
    column_names = list(feature_names)

    # Convert x to a DataFrame without touching the caller's object.
    if isinstance(x, pd.DataFrame):
        df = x.copy()
        if list(df.columns) != column_names:
            df.columns = column_names
    else:
        df = pd.DataFrame(x, columns=column_names)

    if not include_labels:
        return df

    if y is None:
        raise ValueError("y must be provided when include_labels=True")

    if df.shape[0] != len(y):
        raise ValueError(f"Shape mismatch: X has {df.shape[0]} samples but y has {len(y)}")

    # Coerce to a plain array: DataFrame.insert aligns a Series on its index,
    # which would silently produce NaN labels when the indices differ.
    labels = np.asarray(y)

    # Numeric labels go in the first column.
    df.insert(0, label_col_name, labels)

    # String labels are appended only when metadata actually carries them.
    class_labels = getattr(meta, "class_labels", None) if meta is not None else None
    if class_labels is not None:
        df[label_str_col_name] = [class_labels[int(i)] for i in labels]

    return df
def to_csv(
    x: pd.DataFrame | NDArray[np.float64],
    y: NDArray[np.int64],
    meta: DatasetMeta,
    filepath: str | Path,
    *,
    include_labels: bool = True,
    index: bool = False,
    **csv_kwargs,
) -> None:
    """Write the dataset to a CSV file.

    Builds the frame with ``to_labeled_dataframe()`` and hands it to
    ``DataFrame.to_csv()``.

    Args:
        x: Feature matrix.
        y: Class labels.
        meta: Dataset metadata.
        filepath: Output path (e.g., "data/train.csv").
        include_labels: If True, include label columns.
        index: If True, write row indices to the CSV.
        **csv_kwargs: Extra keyword arguments forwarded to
            ``pd.DataFrame.to_csv()`` (e.g., ``sep=';'``). The row index is
            controlled by the ``index`` parameter above.

    Examples:
        >>> to_csv(x, y, meta, "output/dataset.csv", index=False)
    """
    to_labeled_dataframe(x, y, meta, include_labels=include_labels).to_csv(
        filepath,
        index=index,
        **csv_kwargs,
    )
def to_parquet(
    X: pd.DataFrame | NDArray[np.float64],
    y: NDArray[np.int64],
    meta: DatasetMeta,
    filepath: str | Path,
    *,
    include_labels: bool = True,
    **parquet_kwargs,
) -> None:
    """Write the dataset to a Parquet file (efficient for large datasets).

    Builds the frame with ``to_labeled_dataframe()`` and hands it to
    ``DataFrame.to_parquet()``.

    Args:
        X: Feature matrix.
        y: Class labels.
        meta: Dataset metadata.
        filepath: Output path (e.g., "data/train.parquet").
        include_labels: If True, include label columns.
        **parquet_kwargs: Extra keyword arguments forwarded to
            ``pd.DataFrame.to_parquet()`` (e.g., ``compression='gzip'``,
            ``engine='pyarrow'``).

    Examples:
        >>> to_parquet(X, y, meta, "output/dataset.parquet")
    """
    to_labeled_dataframe(X, y, meta, include_labels=include_labels).to_parquet(
        filepath,
        **parquet_kwargs,
    )