# odse/core/data/datasets.py
# Uploaded with huggingface_hub by simeetnayan (commit 4e680fd, verified).
"""Dataset registry for the ODSE sandbox environment.
All datasets are generated in-process using ``sklearn.datasets`` and
plain ``numpy`` / ``pandas`` - **no network downloads required**.
Each dataset bundles a DataFrame with metadata (problem type, target column,
feature columns). Datasets are keyed by ``(name, difficulty)`` and loaded
lazily via factory functions.
Adding a new dataset
--------------------
1. Write a loader function that returns ``DatasetConfig``.
2. Add an entry to ``_REGISTRY`` at the bottom of this file.
"""
from __future__ import annotations
from typing import Callable, Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
from sklearn.datasets import (
load_breast_cancer,
load_iris,
load_wine,
load_diabetes,
load_digits,
load_linnerud,
make_classification,
make_regression,
)
from ..models import Difficulty, ProblemType
# ============================================================================
# DatasetConfig
# ============================================================================
class DatasetConfig:
    """Bundles a DataFrame with modelling metadata.

    Parameters
    ----------
    df : pd.DataFrame
        The raw dataset.
    target_column : str
        Name of the target column.
    problem_type : ProblemType
        Classification or regression.
    problem_description : str
        Human-readable objective for the dataset domain problem.
    feature_columns : List[str] | None
        Explicit feature list; if *None* (or empty), all non-target /
        non-excluded columns are used.
    exclude_columns : List[str] | None
        Columns to exclude from features (IDs, free text, ...).

    Notes
    -----
    ``feature_columns`` and ``exclude_columns`` are copied on construction,
    so mutating the lists passed in afterwards does not affect the config.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        target_column: str,
        problem_type: ProblemType,
        problem_description: str = "",
        feature_columns: Optional[List[str]] = None,
        exclude_columns: Optional[List[str]] = None,
    ) -> None:
        self.df = df
        self.target_column = target_column
        self.problem_type = problem_type
        self.problem_description = problem_description

        # Copy instead of aliasing the caller's list.
        self.exclude_columns: List[str] = (
            list(exclude_columns) if exclude_columns is not None else []
        )

        # ``len(...)`` rather than truthiness so array-like inputs
        # (e.g. a pandas Index) do not raise on an ambiguous bool().
        if feature_columns is not None and len(feature_columns) > 0:
            self.feature_columns: List[str] = list(feature_columns)
        else:
            self.feature_columns = [
                c
                for c in df.columns
                if c != target_column and c not in self.exclude_columns
            ]

    def __repr__(self) -> str:
        """Return a compact summary that avoids dumping the whole frame."""
        return (
            f"{type(self).__name__}("
            f"target_column={self.target_column!r}, "
            f"problem_type={self.problem_type!r}, "
            f"n_rows={len(self.df)}, "
            f"n_features={len(self.feature_columns)})"
        )
# ============================================================================
# Public API
# ============================================================================
def load_dataset(
    name: str,
    difficulty: Difficulty | str = Difficulty.EASY,
) -> DatasetConfig:
    """Load a dataset by *name* and *difficulty*.

    A string *difficulty* is converted with ``Difficulty(difficulty)``.
    Falls back to the difficulty-agnostic ``(name, None)`` entry if the
    exact key is missing. When the loaded config has no problem
    description, a default one is filled in.

    Raises ``ValueError`` when no match is found.
    """
    if isinstance(difficulty, str):
        difficulty = Difficulty(difficulty)

    loader = _REGISTRY.get((name, difficulty))
    if loader is None:
        # Fall back to difficulty-agnostic entry
        loader = _REGISTRY.get((name, None))
    if loader is None:
        # De-duplicate: every name is registered once per difficulty, so a
        # plain list would repeat each dataset several times in the message.
        available = sorted({registered for registered, _ in _REGISTRY})
        raise ValueError(
            f"Unknown dataset '{name}'. Available: {available}"
        )

    cfg = loader()
    if not cfg.problem_description:
        cfg.problem_description = _default_problem_description(name, cfg.problem_type)
    return cfg
def list_datasets() -> List[Dict[str, object]]:
    """Return a summary of all registered datasets.

    Each item has a ``"name"`` (str) and ``"difficulties"`` (sorted list of
    difficulty values). Difficulty-agnostic ``None`` registry entries do not
    contribute a difficulty. Names appear in registry order.
    """
    by_name: Dict[str, List[str]] = {}
    for name, diff in _REGISTRY:
        levels = by_name.setdefault(name, [])
        if diff is not None:
            levels.append(diff.value)
    return [
        {"name": name, "difficulties": sorted(levels)}
        for name, levels in by_name.items()
    ]
def _default_problem_description(name: str, problem_type: ProblemType) -> str:
    """Look up the built-in objective text for dataset *name*.

    Unknown names get a generic sentence chosen by *problem_type*
    (regression vs. anything else).
    """
    if problem_type == ProblemType.REGRESSION:
        generic = "Predict the target column from available features."
    else:
        generic = "Classify each example into the correct target class."

    known: Dict[str, str] = dict(
        breast_cancer=(
            "Predict whether a tumor is malignant or benign from cell-nuclei measurements."
        ),
        iris="Classify iris flowers into species using sepal and petal measurements.",
        wine="Predict wine cultivar class from physicochemical properties.",
        synth_cls="Predict the class label from synthetic tabular features.",
        regression="Predict a continuous target value from synthetic tabular features.",
        house_price=(
            "Estimate house sale price from property attributes and neighborhood context."
        ),
        diabetes=(
            "Predict quantitative diabetes progression from baseline clinical measurements."
        ),
        digits="Classify handwritten digit images based on pixel-intensity features.",
        linnerud="Predict pulse rate from physiological exercise measurements.",
    )
    return known.get(name, generic)
# ============================================================================
# Helpers
# ============================================================================
def _inject_nulls(
df: pd.DataFrame,
columns: List[str],
fraction: float,
seed: int,
) -> pd.DataFrame:
"""Inject NaN into *columns* at the given *fraction*."""
rng = np.random.RandomState(seed)
df = df.copy()
for col in columns:
if col in df.columns:
mask = rng.rand(len(df)) < fraction
df.loc[mask, col] = np.nan
return df
def _add_categorical_column(
df: pd.DataFrame,
col_name: str,
categories: List[str],
seed: int,
) -> pd.DataFrame:
"""Add a random categorical column to *df*."""
rng = np.random.RandomState(seed)
df = df.copy()
df[col_name] = rng.choice(categories, size=len(df))
return df
# ============================================================================
# Dataset Loaders - all offline, no network required
# ============================================================================
# -- Breast Cancer (binary classification) -----------------------------------
def _load_breast_cancer_easy() -> DatasetConfig:
    """Clean breast-cancer data as a classification task on ``target``."""
    raw = load_breast_cancer()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame["target"] = raw.target
    config = DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
    return config
def _load_breast_cancer_medium() -> DatasetConfig:
    """Breast cancer with NaNs injected (fraction 0.15) into four columns."""
    clean = _load_breast_cancer_easy().df
    nulled_columns = ["mean radius", "mean texture", "mean perimeter", "mean area"]
    frame = _inject_nulls(clean, columns=nulled_columns, fraction=0.15, seed=123)
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
def _load_breast_cancer_hard() -> DatasetConfig:
    """Breast cancer with NaNs (fraction 0.25) in every feature + noise columns."""
    clean = _load_breast_cancer_easy().df
    noise_rng = np.random.RandomState(456)

    # Every column except the label receives nulls.
    feature_names = [name for name in clean.columns if name != "target"]
    frame = _inject_nulls(clean, columns=feature_names, fraction=0.25, seed=456)

    n_rows = len(frame)
    # Distractors: one numeric, one categorical (drawn in this order).
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
# -- Iris (multi-class classification) ---------------------------------------
def _load_iris_easy() -> DatasetConfig:
    """Iris as a classification task; ``species`` holds the class names."""
    raw = load_iris()
    # Map the integer codes onto their names as a categorical column.
    species = pd.Categorical.from_codes(raw.target, raw.target_names)
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame["species"] = species
    return DatasetConfig(
        df=frame,
        target_column="species",
        problem_type=ProblemType.CLASSIFICATION,
    )
# -- Wine (multi-class classification) ---------------------------------------
def _load_wine_easy() -> DatasetConfig:
    """Clean wine data as a classification task on ``quality_class``."""
    raw = load_wine()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame["quality_class"] = raw.target
    config = DatasetConfig(
        df=frame,
        target_column="quality_class",
        problem_type=ProblemType.CLASSIFICATION,
    )
    return config
def _load_wine_medium() -> DatasetConfig:
    """Wine with NaNs (fraction 0.20) in three columns plus a ``region`` column."""
    clean = _load_wine_easy().df
    with_nulls = _inject_nulls(
        clean,
        columns=["alcohol", "ash", "magnesium"],
        fraction=0.20,
        seed=321,
    )
    frame = _add_categorical_column(
        with_nulls,
        "region",
        ["north", "south", "east", "west"],
        seed=321,
    )
    return DatasetConfig(
        df=frame,
        target_column="quality_class",
        problem_type=ProblemType.CLASSIFICATION,
    )
# -- Synthetic classification (scalable) -------------------------------------
def _load_synth_cls_easy() -> DatasetConfig:
    """Synthetic 2-class problem: 500 samples, 10 features, no nulls."""
    features, labels = make_classification(
        n_samples=500,
        n_features=10,
        n_informative=6,
        n_redundant=2,
        n_classes=2,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = labels
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
def _load_synth_cls_hard() -> DatasetConfig:
    """Synthetic 4-class problem: 1000 samples, 20 features, nulls + ``group`` column."""
    features, labels = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=10,
        n_redundant=4,
        n_classes=4,
        n_clusters_per_class=2,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = labels

    # Degrade the clean data: NaNs in four features, then a random category.
    frame = _inject_nulls(
        frame, columns=["f0", "f3", "f7", "f12"], fraction=0.15, seed=99
    )
    frame = _add_categorical_column(frame, "group", ["A", "B", "C"], seed=99)
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
# -- Regression (make_regression based) --------------------------------------
def _load_regression_easy() -> DatasetConfig:
    """Synthetic regression: 400 samples, 8 features, no nulls."""
    features, response = make_regression(
        n_samples=400,
        n_features=8,
        n_informative=5,
        noise=10.0,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = response
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
def _load_regression_medium() -> DatasetConfig:
    """Synthetic regression: 600 samples, 12 features, nulls + ``category`` column."""
    features, response = make_regression(
        n_samples=600,
        n_features=12,
        n_informative=7,
        noise=15.0,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = response

    # NaNs in three features (fraction 0.10), then a random category.
    frame = _inject_nulls(frame, columns=["f1", "f4", "f8"], fraction=0.10, seed=55)
    frame = _add_categorical_column(
        frame, "category", ["low", "mid", "high"], seed=55
    )
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
def _load_regression_hard() -> DatasetConfig:
    """Synthetic regression: 1000 samples, 20 features, nulls + distractor columns."""
    features, response = make_regression(
        n_samples=1000,
        n_features=20,
        n_informative=10,
        noise=25.0,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = response

    # Every third feature (f0, f3, ..., f18) gets NaNs at fraction 0.20.
    every_third = [f"f{idx}" for idx in range(0, 20, 3)]
    frame = _inject_nulls(frame, columns=every_third, fraction=0.20, seed=77)

    # Distractors: numeric noise, categorical noise, then a random region.
    noise_rng = np.random.RandomState(77)
    n_rows = len(frame)
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)
    frame = _add_categorical_column(
        frame, "region", ["north", "south", "east", "west"], seed=77
    )
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
# -- House price (synthetic, realistic column names) -------------------------
def _load_house_price() -> DatasetConfig:
    """Generate a 600-row synthetic house-price regression dataset.

    ``price`` is a linear combination of the numeric attributes plus
    Gaussian noise; ``neighborhood`` is sampled but does not enter the
    price formula.
    """
    rng = np.random.RandomState(42)
    n_rows = 600

    # Draw order matters for reproducibility: keep it exactly as below.
    sqft = np.clip(rng.normal(1800, 400, n_rows), 600, 5000)
    bedrooms = rng.choice([1, 2, 3, 4, 5], n_rows, p=[0.05, 0.20, 0.40, 0.25, 0.10])
    bathrooms = rng.choice([1, 2, 3], n_rows, p=[0.25, 0.50, 0.25])
    age = rng.randint(0, 80, n_rows)
    garage = rng.choice([0, 1, 2], n_rows, p=[0.2, 0.5, 0.3])
    neighborhood = rng.choice(
        ["downtown", "suburb", "rural"], n_rows, p=[0.3, 0.5, 0.2]
    )
    price_noise = rng.normal(0, 25_000, n_rows)

    # Price uses the un-truncated float sqft; the stored column is int.
    price = (
        50_000
        + 120 * sqft
        + 15_000 * bedrooms
        + 12_000 * bathrooms
        - 800 * age
        + 20_000 * garage
        + price_noise
    )

    columns = {
        "sqft": sqft.astype(int),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "age": age,
        "garage": garage,
        "neighborhood": neighborhood,
        "price": price.round(0).astype(int),
    }
    return DatasetConfig(
        df=pd.DataFrame(columns),
        target_column="price",
        problem_type=ProblemType.REGRESSION,
    )
# -- Diabetes (regression) ---------------------------------------------------
def _load_diabetes_easy() -> DatasetConfig:
    """Clean diabetes data as a regression task on ``target``."""
    raw = load_diabetes()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame["target"] = raw.target
    config = DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
    return config
def _load_diabetes_medium() -> DatasetConfig:
    """Diabetes with NaNs (fraction 0.12) in three columns plus ``sex_group``."""
    clean = _load_diabetes_easy().df
    with_nulls = _inject_nulls(
        clean,
        columns=["bmi", "bp", "s5"],
        fraction=0.12,
        seed=123,
    )
    frame = _add_categorical_column(
        with_nulls, "sex_group", ["low", "normal", "high"], seed=123
    )
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
def _load_diabetes_hard() -> DatasetConfig:
    """Diabetes with NaNs (fraction 0.22) in all but the last column + noise."""
    clean = _load_diabetes_easy().df
    # The last column is the target appended by the easy loader.
    all_but_last = list(clean.columns[:-1])
    frame = _inject_nulls(clean, columns=all_but_last, fraction=0.22, seed=456)

    noise_rng = np.random.RandomState(456)
    n_rows = len(frame)
    frame["noise1"] = noise_rng.randn(n_rows)
    frame["noise2"] = noise_rng.choice(["type_a", "type_b"], size=n_rows)
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
# -- Digits (multi-class classification) -------------------------------------
def _load_digits_easy() -> DatasetConfig:
    """Digits as a classification task on ``digit`` with ``pixel_0..pixel_63``."""
    raw = load_digits()
    pixel_names = [f"pixel_{idx}" for idx in range(64)]
    frame = pd.DataFrame(raw.data, columns=pixel_names)
    frame["digit"] = raw.target
    return DatasetConfig(
        df=frame,
        target_column="digit",
        problem_type=ProblemType.CLASSIFICATION,
    )
def _load_digits_medium() -> DatasetConfig:
    """Digits with NaNs (fraction 0.08) in every eighth pixel column."""
    clean = _load_digits_easy().df
    # pixel_0, pixel_8, ..., pixel_56
    every_eighth = [f"pixel_{idx}" for idx in range(0, 64, 8)]
    frame = _inject_nulls(clean, columns=every_eighth, fraction=0.08, seed=42)
    return DatasetConfig(
        df=frame,
        target_column="digit",
        problem_type=ProblemType.CLASSIFICATION,
    )
# -- Linnerud (real-world exercise physiology regression) ---------------------
def _load_linnerud_easy() -> DatasetConfig:
    """Linnerud as a regression task: predict ``pulse`` from the data columns.

    Only the ``Pulse`` target is kept (as ``pulse``); the other target
    columns of the bunch are not included in the frame.
    """
    raw = load_linnerud()
    target_frame = pd.DataFrame(raw.target, columns=raw.target_names)
    frame = pd.DataFrame(raw.data, columns=raw.feature_names).copy()
    frame["pulse"] = target_frame["Pulse"]
    return DatasetConfig(
        df=frame,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )
def _load_linnerud_medium() -> DatasetConfig:
    """Linnerud with NaNs (fraction 0.12) in four columns plus ``activity_group``."""
    clean = _load_linnerud_easy().df
    nulled_columns = ["Chins", "Situps", "Weight", "Waist"]
    with_nulls = _inject_nulls(
        clean, columns=nulled_columns, fraction=0.12, seed=551
    )
    levels = ["beginner", "intermediate", "advanced"]
    frame = _add_categorical_column(with_nulls, "activity_group", levels, seed=551)
    return DatasetConfig(
        df=frame,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )
def _load_linnerud_hard() -> DatasetConfig:
    """Linnerud with NaNs (fraction 0.22) in every feature + noise columns."""
    clean = _load_linnerud_easy().df
    # Every column except the label receives nulls.
    feature_names = [name for name in clean.columns if name != "pulse"]
    frame = _inject_nulls(clean, columns=feature_names, fraction=0.22, seed=552)

    noise_rng = np.random.RandomState(552)
    n_rows = len(frame)
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)
    return DatasetConfig(
        df=frame,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )
# ============================================================================
# Registry - (name, Difficulty | None) -> loader callable
# ============================================================================
# A registry key pairs a dataset name with a difficulty; a ``None``
# difficulty marks the fallback entry that ``load_dataset`` uses when the
# exact ``(name, difficulty)`` pair is not registered.
_RegistryKey = Tuple[str, Optional[Difficulty]]
# Maps each key to a zero-argument loader, so datasets are only built when
# requested. Insertion order is the order in which ``list_datasets``
# reports names.
_REGISTRY: Dict[_RegistryKey, Callable[[], DatasetConfig]] = {
    # -- Classification ------------------------------------------------------
    ("breast_cancer", Difficulty.EASY): _load_breast_cancer_easy,
    ("breast_cancer", Difficulty.MEDIUM): _load_breast_cancer_medium,
    ("breast_cancer", Difficulty.HARD): _load_breast_cancer_hard,
    ("breast_cancer", None): _load_breast_cancer_easy,
    ("iris", Difficulty.EASY): _load_iris_easy,
    ("iris", None): _load_iris_easy,
    ("wine", Difficulty.EASY): _load_wine_easy,
    ("wine", Difficulty.MEDIUM): _load_wine_medium,
    ("wine", None): _load_wine_easy,
    ("synth_cls", Difficulty.EASY): _load_synth_cls_easy,
    ("synth_cls", Difficulty.HARD): _load_synth_cls_hard,
    ("synth_cls", None): _load_synth_cls_easy,
    # NOTE: the diabetes loaders return ProblemType.REGRESSION configs even
    # though they sit under the classification heading; their position is
    # kept because it determines the ordering seen by ``list_datasets``.
    ("diabetes", Difficulty.EASY): _load_diabetes_easy,
    ("diabetes", Difficulty.MEDIUM): _load_diabetes_medium,
    ("diabetes", Difficulty.HARD): _load_diabetes_hard,
    ("diabetes", None): _load_diabetes_easy,
    ("digits", Difficulty.EASY): _load_digits_easy,
    ("digits", Difficulty.MEDIUM): _load_digits_medium,
    ("digits", None): _load_digits_easy,
    # -- Regression ----------------------------------------------------------
    ("regression", Difficulty.EASY): _load_regression_easy,
    ("regression", Difficulty.MEDIUM): _load_regression_medium,
    ("regression", Difficulty.HARD): _load_regression_hard,
    ("regression", None): _load_regression_easy,
    # house_price has a single variant, registered for EASY and as fallback.
    ("house_price", Difficulty.EASY): _load_house_price,
    ("house_price", None): _load_house_price,
    ("linnerud", Difficulty.EASY): _load_linnerud_easy,
    ("linnerud", Difficulty.MEDIUM): _load_linnerud_medium,
    ("linnerud", Difficulty.HARD): _load_linnerud_hard,
    ("linnerud", None): _load_linnerud_easy,
}