# Spaces: simeetnayan / odse (Sleeping) - File size: 19,624 Bytes

"""Dataset registry for the ODSE sandbox environment.

All datasets are generated in-process using ``sklearn.datasets`` and
plain ``numpy`` / ``pandas`` - **no network downloads required**.

Each dataset bundles a DataFrame with metadata (problem type, target column,
feature columns). Datasets are keyed by ``(name, difficulty)`` and loaded
lazily via factory functions.

Adding a new dataset
--------------------
1. Write a loader function that returns ``DatasetConfig``.
2. Add an entry to ``_REGISTRY`` at the bottom of this file.
"""

from __future__ import annotations

from typing import Callable, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.datasets import (
    load_breast_cancer,
    load_iris,
    load_wine,
    load_diabetes,
    load_digits,
    load_linnerud,
    make_classification,
    make_regression,
)

from ..models import Difficulty, ProblemType

# ============================================================================
# DatasetConfig
# ============================================================================

class DatasetConfig:
    """Container pairing a DataFrame with the metadata needed to model it.

    Parameters
    ----------
    df : pd.DataFrame
        The raw dataset; the object is stored as passed (no copy is taken).
    target_column : str
        Name of the column to predict.
    problem_type : ProblemType
        Kind of modelling task (classification or regression).
    problem_description : str
        Human-readable statement of the objective; defaults to ``""``.
    feature_columns : List[str] | None
        Explicit feature list. When omitted or empty, every column of *df*
        other than the target and the excluded columns is used.
    exclude_columns : List[str] | None
        Columns kept out of the derived feature list (IDs, free text, ...).
        An explicitly supplied *feature_columns* is not filtered by it.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        target_column: str,
        problem_type: ProblemType,
        problem_description: str = "",
        feature_columns: Optional[List[str]] = None,
        exclude_columns: Optional[List[str]] = None,
    ) -> None:
        self.df = df
        self.target_column = target_column
        self.problem_type = problem_type
        self.problem_description = problem_description
        self.exclude_columns = exclude_columns if exclude_columns else []

        if feature_columns:
            # Caller supplied a non-empty list: use it verbatim.
            selected = feature_columns
        else:
            # Derive the features from the frame, preserving column order.
            selected = []
            for column in df.columns:
                if column != target_column and column not in self.exclude_columns:
                    selected.append(column)
        self.feature_columns: List[str] = selected

# ============================================================================
# Public API
# ============================================================================

def load_dataset(
    name: str,
    difficulty: Difficulty | str = Difficulty.EASY,
) -> DatasetConfig:
    """Load a dataset by *name* and *difficulty*.

    Parameters
    ----------
    name : str
        Registry name of the dataset (e.g. ``"iris"``).
    difficulty : Difficulty | str
        Requested difficulty; a string is converted with ``Difficulty(...)``.

    Returns
    -------
    DatasetConfig
        A freshly built config. If the loader left ``problem_description``
        empty, a default description for *name* is filled in.

    Raises
    ------
    ValueError
        If *name* has neither an entry for *difficulty* nor a
        difficulty-agnostic ``(name, None)`` entry. An unrecognised
        difficulty string also raises ``ValueError`` (from the enum lookup).
    """
    if isinstance(difficulty, str):
        difficulty = Difficulty(difficulty)

    key: _RegistryKey = (name, difficulty)
    loader = _REGISTRY.get(key)

    if loader is None:
        # Fall back to the difficulty-agnostic entry for this name.
        loader = _REGISTRY.get((name, None))

    if loader is None:
        # Each name is registered once per difficulty, so collapse the keys
        # to a set to avoid listing the same dataset several times.
        available = sorted({registered_name for registered_name, _ in _REGISTRY})
        raise ValueError(
            f"Unknown dataset '{name}'. Available: {available}"
        )

    cfg = loader()
    if not cfg.problem_description:
        cfg.problem_description = _default_problem_description(name, cfg.problem_type)
    return cfg

def list_datasets() -> List[Dict[str, str | List[str]]]:
    """Return a summary of all registered datasets.

    Each entry is ``{"name": <dataset name>, "difficulties": [...]}`` where
    ``difficulties`` is the sorted list of difficulty values registered for
    that name. Difficulty-agnostic ``(name, None)`` keys contribute the name
    but no difficulty. Entries follow the registry's insertion order.
    """
    datasets: Dict[str, List[str]] = {}
    for name, diff in _REGISTRY:
        levels = datasets.setdefault(name, [])
        if diff is not None:
            levels.append(diff.value)
    return [
        {"name": n, "difficulties": sorted(d)} for n, d in datasets.items()
    ]


def _default_problem_description(name: str, problem_type: ProblemType) -> str:
    """Return the built-in objective text for dataset *name*.

    Known dataset names map to a domain-specific sentence; any other name
    gets a generic sentence chosen by *problem_type* (regression vs.
    everything else).
    """
    # Generic wording used when *name* has no dedicated description.
    if problem_type == ProblemType.REGRESSION:
        generic = "Predict the target column from available features."
    else:
        generic = "Classify each example into the correct target class."

    known: Dict[str, str] = {
        "breast_cancer": "Predict whether a tumor is malignant or benign from cell-nuclei measurements.",
        "iris": "Classify iris flowers into species using sepal and petal measurements.",
        "wine": "Predict wine cultivar class from physicochemical properties.",
        "synth_cls": "Predict the class label from synthetic tabular features.",
        "regression": "Predict a continuous target value from synthetic tabular features.",
        "house_price": "Estimate house sale price from property attributes and neighborhood context.",
        "diabetes": "Predict quantitative diabetes progression from baseline clinical measurements.",
        "digits": "Classify handwritten digit images based on pixel-intensity features.",
        "linnerud": "Predict pulse rate from physiological exercise measurements.",
    }
    return known.get(name, generic)

# ============================================================================
# Helpers
# ============================================================================

def _inject_nulls(
    df: pd.DataFrame,
    columns: List[str],
    fraction: float,
    seed: int,
) -> pd.DataFrame:
    """Return a copy of *df* with NaN scattered through *columns*.

    For each listed column present in the frame, a fresh uniform draw per
    row decides (``draw < fraction``) whether that cell becomes NaN. Names
    not found in the frame are skipped without consuming random numbers.
    The input frame is left untouched.
    """
    result = df.copy()
    generator = np.random.RandomState(seed)
    n_rows = len(result)
    for name in columns:
        if name not in result.columns:
            continue
        hit_rows = generator.rand(n_rows) < fraction
        result.loc[hit_rows, name] = np.nan
    return result

def _add_categorical_column(
    df: pd.DataFrame,
    col_name: str,
    categories: List[str],
    seed: int,
) -> pd.DataFrame:
    """Return a copy of *df* with a randomly filled column *col_name*.

    One value per row is sampled from *categories* using a ``RandomState``
    seeded with *seed*. The input frame is left untouched.
    """
    generator = np.random.RandomState(seed)
    sampled = generator.choice(categories, size=len(df))
    result = df.copy()
    result[col_name] = sampled
    return result

# ============================================================================
# Dataset Loaders - all offline, no network required
# ============================================================================

# -- Breast Cancer (binary classification) -----------------------------------

def _load_breast_cancer_easy() -> DatasetConfig:
    """Breast cancer, easy: the unmodified sklearn data (binary classification).

    Columns are the bundled feature names followed by ``target``.
    """
    raw = load_breast_cancer()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names).assign(
        target=raw.target
    )
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )

def _load_breast_cancer_medium() -> DatasetConfig:
    """Breast cancer, medium: ~15 % nulls in four size-related mean features."""
    # Only these columns receive missing values; everything else stays clean.
    null_targets = ["mean radius", "mean texture", "mean perimeter", "mean area"]
    base = _load_breast_cancer_easy()
    frame = _inject_nulls(base.df, columns=null_targets, fraction=0.15, seed=123)
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )

def _load_breast_cancer_hard() -> DatasetConfig:
    """Breast cancer, hard: ~25 % nulls in every feature plus two noise columns.

    ``noise_a`` is standard-normal noise and ``noise_b`` a random label from
    ``x``/``y``/``z``; neither is derived from the target.
    """
    base = _load_breast_cancer_easy()
    predictors = [c for c in base.df.columns if c != "target"]
    frame = _inject_nulls(base.df, columns=predictors, fraction=0.25, seed=456)

    # Separate generator from the one used for null injection (same seed value).
    noise_rng = np.random.RandomState(456)
    n_rows = len(frame)
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )

# -- Iris (multi-class classification) ---------------------------------------

def _load_iris_easy() -> DatasetConfig:
    """Iris, easy: clean numeric features with a categorical ``species`` target.

    The integer class codes are mapped to their species names.
    """
    raw = load_iris()
    species = pd.Categorical.from_codes(raw.target, raw.target_names)
    frame = pd.DataFrame(raw.data, columns=raw.feature_names).assign(
        species=species
    )
    return DatasetConfig(
        frame,
        target_column="species",
        problem_type=ProblemType.CLASSIFICATION,
    )

# -- Wine (multi-class classification) ---------------------------------------

def _load_wine_easy() -> DatasetConfig:
    """Wine, easy: the unmodified sklearn data (multi-class classification).

    The class label is stored in the ``quality_class`` column.
    """
    raw = load_wine()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names).assign(
        quality_class=raw.target
    )
    return DatasetConfig(
        frame,
        target_column="quality_class",
        problem_type=ProblemType.CLASSIFICATION,
    )

def _load_wine_medium() -> DatasetConfig:
    """Wine, medium: ~20 % nulls in three features plus a random ``region``."""
    base = _load_wine_easy()
    with_nulls = _inject_nulls(
        base.df,
        columns=["alcohol", "ash", "magnesium"],
        fraction=0.20,
        seed=321,
    )
    # ``region`` is sampled independently of the data, so it carries no signal.
    frame = _add_categorical_column(
        with_nulls,
        "region",
        ["north", "south", "east", "west"],
        seed=321,
    )
    return DatasetConfig(
        frame,
        target_column="quality_class",
        problem_type=ProblemType.CLASSIFICATION,
    )

# -- Synthetic classification (scalable) -------------------------------------

def _load_synth_cls_easy() -> DatasetConfig:
    """Synthetic binary classification: 500 rows, 10 features ``f0``..``f9``, clean."""
    features, labels = make_classification(
        n_samples=500,
        n_features=10,
        n_informative=6,
        n_redundant=2,
        n_classes=2,
        random_state=42,
    )
    names = [f"f{i}" for i in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=names).assign(target=labels)
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )

def _load_synth_cls_hard() -> DatasetConfig:
    """Synthetic 4-class classification: 1000 rows, 20 features, nulls + noise.

    Four features receive ~15 % nulls and a random ``group`` label column
    (``A``/``B``/``C``) is appended as a distractor.
    """
    features, labels = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=10,
        n_redundant=4,
        n_classes=4,
        n_clusters_per_class=2,
        random_state=42,
    )
    names = [f"f{i}" for i in range(features.shape[1])]
    clean = pd.DataFrame(features, columns=names).assign(target=labels)
    with_nulls = _inject_nulls(
        clean, columns=["f0", "f3", "f7", "f12"], fraction=0.15, seed=99
    )
    frame = _add_categorical_column(with_nulls, "group", ["A", "B", "C"], seed=99)
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )

# -- Regression (make_regression based) --------------------------------------

def _load_regression_easy() -> DatasetConfig:
    """Synthetic regression, easy: 400 rows, 8 features ``f0``..``f7``, clean."""
    features, response = make_regression(
        n_samples=400,
        n_features=8,
        n_informative=5,
        noise=10.0,
        random_state=42,
    )
    names = [f"f{i}" for i in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=names).assign(target=response)
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )

def _load_regression_medium() -> DatasetConfig:
    """Synthetic regression, medium: 600 rows, 12 features, some nulls.

    Three features receive ~10 % nulls and a random ``category`` column
    (``low``/``mid``/``high``) is appended.
    """
    features, response = make_regression(
        n_samples=600,
        n_features=12,
        n_informative=7,
        noise=15.0,
        random_state=42,
    )
    names = [f"f{i}" for i in range(features.shape[1])]
    clean = pd.DataFrame(features, columns=names).assign(target=response)
    with_nulls = _inject_nulls(
        clean, columns=["f1", "f4", "f8"], fraction=0.10, seed=55
    )
    frame = _add_categorical_column(
        with_nulls, "category", ["low", "mid", "high"], seed=55
    )
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )

def _load_regression_hard() -> DatasetConfig:
    """Synthetic regression, hard: 1000 rows, 20 features, nulls + distractors.

    Every third feature (``f0``, ``f3``, ... ``f18``) receives ~20 % nulls;
    ``noise_a``, ``noise_b`` and a random ``region`` column are appended.
    """
    features, response = make_regression(
        n_samples=1000,
        n_features=20,
        n_informative=10,
        noise=25.0,
        random_state=42,
    )
    names = [f"f{i}" for i in range(features.shape[1])]
    clean = pd.DataFrame(features, columns=names).assign(target=response)

    sparse_cols = [f"f{i}" for i in range(0, 20, 3)]
    frame = _inject_nulls(clean, columns=sparse_cols, fraction=0.20, seed=77)

    # Distractor columns drawn from a dedicated generator.
    noise_rng = np.random.RandomState(77)
    n_rows = len(frame)
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)

    frame = _add_categorical_column(
        frame, "region", ["north", "south", "east", "west"], seed=77
    )
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )

# -- House price (synthetic, realistic column names) -------------------------

def _load_house_price() -> DatasetConfig:
    """Synthetic house-price regression with realistic column names.

    600 rows are generated from a fixed seed. ``price`` is a linear function
    of sqft, bedrooms, bathrooms, age and garage plus Gaussian noise;
    ``neighborhood`` is sampled independently and does not enter the price
    formula.
    """
    n_rows = 600
    gen = np.random.RandomState(42)

    # The draw order below is fixed: reordering changes every generated value.
    living_area = gen.normal(1800, 400, n_rows).clip(600, 5000)
    n_bedrooms = gen.choice([1, 2, 3, 4, 5], n_rows, p=[0.05, 0.20, 0.40, 0.25, 0.10])
    n_bathrooms = gen.choice([1, 2, 3], n_rows, p=[0.25, 0.50, 0.25])
    years_old = gen.randint(0, 80, n_rows)
    garage_slots = gen.choice([0, 1, 2], n_rows, p=[0.2, 0.5, 0.3])
    area_kind = gen.choice(["downtown", "suburb", "rural"], n_rows, p=[0.3, 0.5, 0.2])
    residual = gen.normal(0, 25_000, n_rows)

    # Price uses the un-truncated float square footage.
    sale_price = (
        50_000
        + 120 * living_area
        + 15_000 * n_bedrooms
        + 12_000 * n_bathrooms
        - 800 * years_old
        + 20_000 * garage_slots
        + residual
    )

    columns = {
        "sqft": living_area.astype(int),
        "bedrooms": n_bedrooms,
        "bathrooms": n_bathrooms,
        "age": years_old,
        "garage": garage_slots,
        "neighborhood": area_kind,
        "price": sale_price.round(0).astype(int),
    }
    return DatasetConfig(
        pd.DataFrame(columns),
        target_column="price",
        problem_type=ProblemType.REGRESSION,
    )

# -- Diabetes (regression) ---------------------------------------------------
def _load_diabetes_easy() -> DatasetConfig:
    """Diabetes, easy: the unmodified sklearn data (regression on ``target``)."""
    raw = load_diabetes()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names).assign(
        target=raw.target
    )
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )

def _load_diabetes_medium() -> DatasetConfig:
    """Diabetes, medium: ~12 % nulls in three features plus one random category.

    The appended ``sex_group`` column is sampled at random from
    ``low``/``normal``/``high``; it is not derived from the data.
    """
    base = _load_diabetes_easy()
    with_nulls = _inject_nulls(
        base.df,
        columns=["bmi", "bp", "s5"],
        fraction=0.12,
        seed=123,
    )
    frame = _add_categorical_column(
        with_nulls, "sex_group", ["low", "normal", "high"], seed=123
    )
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )

def _load_diabetes_hard() -> DatasetConfig:
    """Diabetes, hard: ~22 % nulls in every feature plus two noise columns.

    ``noise1`` is standard-normal noise and ``noise2`` a random label from
    ``type_a``/``type_b``.
    """
    base = _load_diabetes_easy()
    # Every column except the target (which the easy loader appends last).
    predictors = [c for c in base.df.columns if c != "target"]
    frame = _inject_nulls(base.df, columns=predictors, fraction=0.22, seed=456)

    noise_rng = np.random.RandomState(456)
    n_rows = len(frame)
    frame["noise1"] = noise_rng.randn(n_rows)
    frame["noise2"] = noise_rng.choice(["type_a", "type_b"], size=n_rows)
    return DatasetConfig(
        frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )


# -- Digits (multi-class classification) -------------------------------------
def _load_digits_easy() -> DatasetConfig:
    """Handwritten digits, easy: 64 ``pixel_<i>`` features, ``digit`` target."""
    raw = load_digits()
    pixel_names = [f"pixel_{i}" for i in range(64)]
    frame = pd.DataFrame(raw.data, columns=pixel_names).assign(digit=raw.target)
    return DatasetConfig(
        frame,
        target_column="digit",
        problem_type=ProblemType.CLASSIFICATION,
    )

def _load_digits_medium() -> DatasetConfig:
    """Digits, medium: ~8 % nulls in every eighth pixel column.

    Exercises imputation on high-dimensional data.
    """
    base = _load_digits_easy()
    # pixel_0, pixel_8, ..., pixel_56
    sparse_pixels = [f"pixel_{i}" for i in range(0, 64, 8)]
    frame = _inject_nulls(base.df, columns=sparse_pixels, fraction=0.08, seed=42)
    return DatasetConfig(
        frame,
        target_column="digit",
        problem_type=ProblemType.CLASSIFICATION,
    )


# -- Linnerud (real-world exercise physiology regression) ---------------------
def _load_linnerud_easy() -> DatasetConfig:
    """Linnerud - predict pulse from exercise and body measurements.

    ``load_linnerud`` splits its variables into an exercise block
    (``data``: Chins, Situps, Jumps) and a physiological block
    (``target``: Weight, Waist, Pulse). Pulse becomes the prediction target
    ``pulse``; Weight and Waist are carried over as features, so the frame
    really contains the body measurements this loader advertises and that
    the medium variant lists among the columns to null.

    Returns
    -------
    DatasetConfig
        Regression config with columns Chins, Situps, Jumps, Weight, Waist
        and the target ``pulse``.
    """
    bunch = load_linnerud()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    targets = pd.DataFrame(bunch.target, columns=bunch.target_names)
    df = features.copy()
    # Body measurements live in sklearn's "target" block; copy them across
    # explicitly so they are available as predictors.
    for body_col in ("Weight", "Waist"):
        df[body_col] = targets[body_col]
    df["pulse"] = targets["Pulse"]
    return DatasetConfig(
        df=df,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )


def _load_linnerud_medium() -> DatasetConfig:
    """Linnerud, medium: moderate missingness plus one random context column.

    Nulls (~12 %) are requested for Chins, Situps, Weight and Waist; any of
    those names absent from the base frame is skipped by ``_inject_nulls``.
    ``activity_group`` is a randomly sampled label.
    """
    null_targets = ["Chins", "Situps", "Weight", "Waist"]
    levels = ["beginner", "intermediate", "advanced"]

    base = _load_linnerud_easy()
    with_nulls = _inject_nulls(
        base.df, columns=null_targets, fraction=0.12, seed=551
    )
    frame = _add_categorical_column(with_nulls, "activity_group", levels, seed=551)
    return DatasetConfig(
        frame,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )


def _load_linnerud_hard() -> DatasetConfig:
    """Linnerud, hard: ~22 % nulls in every feature plus two distractors.

    ``noise_a`` is standard-normal noise and ``noise_b`` a random label from
    ``x``/``y``/``z``.
    """
    base = _load_linnerud_easy()
    predictors = [c for c in base.df.columns if c != "pulse"]
    frame = _inject_nulls(base.df, columns=predictors, fraction=0.22, seed=552)

    noise_rng = np.random.RandomState(552)
    n_rows = len(frame)
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)
    return DatasetConfig(
        frame,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )

# ============================================================================
# Registry - (name, Difficulty | None) -> loader callable
# ============================================================================

# Registry key: (dataset name, difficulty). A ``None`` difficulty marks the
# difficulty-agnostic fallback that ``load_dataset`` uses when the exact
# (name, difficulty) pair is not registered.
_RegistryKey = Tuple[str, Optional[Difficulty]]

# Maps each key to a zero-argument loader; loaders run only when called.
# Insertion order matters: ``list_datasets`` reports names in the order they
# first appear here, so do not reorder entries casually.
_REGISTRY: Dict[_RegistryKey, Callable[[], DatasetConfig]] = {
    # -- Classification ------------------------------------------------------
    ("breast_cancer", Difficulty.EASY): _load_breast_cancer_easy,
    ("breast_cancer", Difficulty.MEDIUM): _load_breast_cancer_medium,
    ("breast_cancer", Difficulty.HARD): _load_breast_cancer_hard,
    ("breast_cancer", None): _load_breast_cancer_easy,
    ("iris", Difficulty.EASY): _load_iris_easy,
    ("iris", None): _load_iris_easy,
    ("wine", Difficulty.EASY): _load_wine_easy,
    ("wine", Difficulty.MEDIUM): _load_wine_medium,
    ("wine", None): _load_wine_easy,
    ("synth_cls", Difficulty.EASY): _load_synth_cls_easy,
    ("synth_cls", Difficulty.HARD): _load_synth_cls_hard,
    ("synth_cls", None): _load_synth_cls_easy,
    # NOTE: "diabetes" is a regression task (its loaders use
    # ProblemType.REGRESSION) even though it sits in this section.
    ("diabetes", Difficulty.EASY): _load_diabetes_easy,
    ("diabetes", Difficulty.MEDIUM): _load_diabetes_medium,
    ("diabetes", Difficulty.HARD): _load_diabetes_hard,
    ("diabetes", None): _load_diabetes_easy,
    # "digits" is classification again.
    ("digits", Difficulty.EASY): _load_digits_easy,
    ("digits", Difficulty.MEDIUM): _load_digits_medium,
    ("digits", None): _load_digits_easy,
    # -- Regression ----------------------------------------------------------
    ("regression", Difficulty.EASY): _load_regression_easy,
    ("regression", Difficulty.MEDIUM): _load_regression_medium,
    ("regression", Difficulty.HARD): _load_regression_hard,
    ("regression", None): _load_regression_easy,
    ("house_price", Difficulty.EASY): _load_house_price,
    ("house_price", None): _load_house_price,
    ("linnerud", Difficulty.EASY): _load_linnerud_easy,
    ("linnerud", Difficulty.MEDIUM): _load_linnerud_medium,
    ("linnerud", Difficulty.HARD): _load_linnerud_hard,
    ("linnerud", None): _load_linnerud_easy,
}