Spaces:
Running
Running
| """Dataset registry for the ODSE sandbox environment. | |
| All datasets are generated in-process using ``sklearn.datasets`` and | |
| plain ``numpy`` / ``pandas`` - **no network downloads required**. | |
| Each dataset bundles a DataFrame with metadata (problem type, target column, | |
| feature columns). Datasets are keyed by ``(name, difficulty)`` and loaded | |
| lazily via factory functions. | |
| Adding a new dataset | |
| -------------------- | |
| 1. Write a loader function that returns ``DatasetConfig``. | |
| 2. Add an entry to ``_REGISTRY`` at the bottom of this file. | |
| """ | |
| from __future__ import annotations | |
| from typing import Callable, Dict, List, Optional, Tuple | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.datasets import ( | |
| load_breast_cancer, | |
| load_iris, | |
| load_wine, | |
| load_diabetes, | |
| load_digits, | |
| load_linnerud, | |
| make_classification, | |
| make_regression, | |
| ) | |
| from ..models import Difficulty, ProblemType | |
| # ============================================================================ | |
| # DatasetConfig | |
| # ============================================================================ | |
class DatasetConfig:
    """Bundle a DataFrame with the metadata needed to model it.

    Parameters
    ----------
    df : pd.DataFrame
        The raw dataset.
    target_column : str
        Name of the target column.
    problem_type : ProblemType
        Classification or regression.
    problem_description : str
        Human-readable objective for the dataset domain problem.
    feature_columns : List[str] | None
        Explicit feature list; if *None*, all non-target / non-excluded
        columns are used. An explicit list (including an empty one) is
        kept exactly as given.
    exclude_columns : List[str] | None
        Columns to exclude from features (IDs, free text, ...).
    """

    def __init__(
        self,
        df: pd.DataFrame,
        target_column: str,
        problem_type: ProblemType,
        problem_description: str = "",
        feature_columns: Optional[List[str]] = None,
        exclude_columns: Optional[List[str]] = None,
    ) -> None:
        self.df = df
        self.target_column = target_column
        self.problem_type = problem_type
        self.problem_description = problem_description
        # Copy caller-supplied lists so later mutation on the caller's side
        # cannot silently change this config.
        self.exclude_columns: List[str] = (
            list(exclude_columns) if exclude_columns is not None else []
        )
        if feature_columns is None:
            # Derive features: every column that is neither the target nor
            # explicitly excluded, in DataFrame column order.
            self.feature_columns: List[str] = [
                c
                for c in df.columns
                if c != target_column and c not in self.exclude_columns
            ]
        else:
            self.feature_columns = list(feature_columns)
| # ============================================================================ | |
| # Public API | |
| # ============================================================================ | |
def load_dataset(
    name: str,
    difficulty: Difficulty | str = Difficulty.EASY,
) -> DatasetConfig:
    """Load a dataset by *name* and *difficulty*.

    A string *difficulty* is converted with ``Difficulty(difficulty)``; any
    error raised by that conversion propagates to the caller.

    Falls back to the difficulty-agnostic ``(name, None)`` entry if the exact
    key is missing. When the loader leaves ``problem_description`` empty, a
    default description for *name* is filled in.

    Raises
    ------
    ValueError
        When neither the exact key nor the fallback key is registered. The
        message lists each registered dataset name once.
    """
    if isinstance(difficulty, str):
        difficulty = Difficulty(difficulty)
    key: _RegistryKey = (name, difficulty)
    loader = _REGISTRY.get(key)
    if loader is None:
        # Fall back to difficulty-agnostic entry
        loader = _REGISTRY.get((name, None))
    if loader is None:
        # The registry holds several keys per dataset (one per difficulty),
        # so collapse to unique names before reporting them.
        available = sorted({registered for registered, _ in _REGISTRY})
        raise ValueError(
            f"Unknown dataset '{name}'. Available: {available}"
        )
    cfg = loader()
    if not cfg.problem_description:
        cfg.problem_description = _default_problem_description(name, cfg.problem_type)
    return cfg
def list_datasets() -> List[Dict[str, str | List[str]]]:
    """Return a summary of all registered datasets.

    Each entry is ``{"name": <dataset name>, "difficulties": [...]}`` where
    ``difficulties`` is the sorted list of ``Difficulty.value`` entries
    registered for that name. Difficulty-agnostic (``None``) registry keys
    contribute the name but no difficulty. Entries follow the order in which
    names first appear in ``_REGISTRY``.
    """
    by_name: Dict[str, List[str]] = {}
    for name, diff in _REGISTRY:
        levels = by_name.setdefault(name, [])
        if diff is not None:
            levels.append(diff.value)
    return [
        {"name": n, "difficulties": sorted(levels)} for n, levels in by_name.items()
    ]
def _default_problem_description(name: str, problem_type: ProblemType) -> str:
    """Return the default objective text for dataset *name*.

    Known dataset names map to a domain-specific sentence; any other name
    gets a generic sentence chosen by *problem_type* (regression vs.
    everything else).
    """
    known: Dict[str, str] = {
        "breast_cancer": "Predict whether a tumor is malignant or benign from cell-nuclei measurements.",
        "iris": "Classify iris flowers into species using sepal and petal measurements.",
        "wine": "Predict wine cultivar class from physicochemical properties.",
        "synth_cls": "Predict the class label from synthetic tabular features.",
        "regression": "Predict a continuous target value from synthetic tabular features.",
        "house_price": "Estimate house sale price from property attributes and neighborhood context.",
        "diabetes": "Predict quantitative diabetes progression from baseline clinical measurements.",
        "digits": "Classify handwritten digit images based on pixel-intensity features.",
        "linnerud": "Predict pulse rate from physiological exercise measurements.",
    }
    if problem_type == ProblemType.REGRESSION:
        generic = "Predict the target column from available features."
    else:
        generic = "Classify each example into the correct target class."
    if name in known:
        return known[name]
    return generic
| # ============================================================================ | |
| # Helpers | |
| # ============================================================================ | |
| def _inject_nulls( | |
| df: pd.DataFrame, | |
| columns: List[str], | |
| fraction: float, | |
| seed: int, | |
| ) -> pd.DataFrame: | |
| """Inject NaN into *columns* at the given *fraction*.""" | |
| rng = np.random.RandomState(seed) | |
| df = df.copy() | |
| for col in columns: | |
| if col in df.columns: | |
| mask = rng.rand(len(df)) < fraction | |
| df.loc[mask, col] = np.nan | |
| return df | |
| def _add_categorical_column( | |
| df: pd.DataFrame, | |
| col_name: str, | |
| categories: List[str], | |
| seed: int, | |
| ) -> pd.DataFrame: | |
| """Add a random categorical column to *df*.""" | |
| rng = np.random.RandomState(seed) | |
| df = df.copy() | |
| df[col_name] = rng.choice(categories, size=len(df)) | |
| return df | |
| # ============================================================================ | |
| # Dataset Loaders - all offline, no network required | |
| # ============================================================================ | |
| # -- Breast Cancer (binary classification) ----------------------------------- | |
def _load_breast_cancer_easy() -> DatasetConfig:
    """Breast cancer - binary classification on the clean sklearn dataset.

    Features come from ``load_breast_cancer().data``; the label is stored in
    a ``target`` column.
    """
    raw = load_breast_cancer()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame["target"] = raw.target
    config = DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
    return config
def _load_breast_cancer_medium() -> DatasetConfig:
    """Breast cancer with ~15 % nulls injected into four "mean" columns."""
    base = _load_breast_cancer_easy()
    null_targets = ["mean radius", "mean texture", "mean perimeter", "mean area"]
    frame = _inject_nulls(base.df, columns=null_targets, fraction=0.15, seed=123)
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
def _load_breast_cancer_hard() -> DatasetConfig:
    """Breast cancer with ~25 % nulls in every feature + two noise columns.

    ``noise_a`` is numeric and ``noise_b`` is a random three-level string
    column; both are drawn from ``RandomState(456)``.
    """
    base = _load_breast_cancer_easy()
    feature_names = [name for name in base.df.columns if name != "target"]
    frame = _inject_nulls(base.df, columns=feature_names, fraction=0.25, seed=456)
    # Draw order matters for reproducibility: numeric noise first, then the
    # categorical noise.
    noise_rng = np.random.RandomState(456)
    n_rows = len(frame)
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
| # -- Iris (multi-class classification) --------------------------------------- | |
def _load_iris_easy() -> DatasetConfig:
    """Iris - multi-class classification on the clean sklearn dataset.

    The integer labels are converted to a categorical ``species`` column
    using the dataset's own target names.
    """
    raw = load_iris()
    species = pd.Categorical.from_codes(raw.target, raw.target_names)
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame["species"] = species
    return DatasetConfig(
        df=frame,
        target_column="species",
        problem_type=ProblemType.CLASSIFICATION,
    )
| # -- Wine (multi-class classification) --------------------------------------- | |
def _load_wine_easy() -> DatasetConfig:
    """Wine - multi-class classification on the clean sklearn dataset.

    The class label is stored in a ``quality_class`` column.
    """
    raw = load_wine()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame["quality_class"] = raw.target
    config = DatasetConfig(
        df=frame,
        target_column="quality_class",
        problem_type=ProblemType.CLASSIFICATION,
    )
    return config
def _load_wine_medium() -> DatasetConfig:
    """Wine with ~20 % nulls in three columns + a random ``region`` column."""
    base = _load_wine_easy()
    with_nulls = _inject_nulls(
        base.df,
        columns=["alcohol", "ash", "magnesium"],
        fraction=0.20,
        seed=321,
    )
    frame = _add_categorical_column(
        with_nulls,
        "region",
        ["north", "south", "east", "west"],
        seed=321,
    )
    return DatasetConfig(
        df=frame,
        target_column="quality_class",
        problem_type=ProblemType.CLASSIFICATION,
    )
| # -- Synthetic classification (scalable) ------------------------------------- | |
def _load_synth_cls_easy() -> DatasetConfig:
    """Synthetic binary classification - 10 features, 500 samples, clean."""
    features, labels = make_classification(
        n_samples=500,
        n_features=10,
        n_informative=6,
        n_redundant=2,
        n_classes=2,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = labels
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
def _load_synth_cls_hard() -> DatasetConfig:
    """Synthetic 4-class problem - 20 features, 1000 samples, nulls + noise.

    Four feature columns receive ~15 % nulls and a random three-level
    ``group`` column is appended.
    """
    features, labels = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=10,
        n_redundant=4,
        n_classes=4,
        n_clusters_per_class=2,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = labels
    frame = _inject_nulls(
        frame, columns=["f0", "f3", "f7", "f12"], fraction=0.15, seed=99
    )
    frame = _add_categorical_column(frame, "group", ["A", "B", "C"], seed=99)
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.CLASSIFICATION,
    )
| # -- Regression (make_regression based) -------------------------------------- | |
def _load_regression_easy() -> DatasetConfig:
    """Simple regression - 8 features, 400 samples, clean."""
    features, response = make_regression(
        n_samples=400,
        n_features=8,
        n_informative=5,
        noise=10.0,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = response
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
def _load_regression_medium() -> DatasetConfig:
    """Medium regression - 12 features, 600 samples, some nulls.

    Three feature columns receive ~10 % nulls and a random three-level
    ``category`` column is appended.
    """
    features, response = make_regression(
        n_samples=600,
        n_features=12,
        n_informative=7,
        noise=15.0,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = response
    frame = _inject_nulls(frame, columns=["f1", "f4", "f8"], fraction=0.10, seed=55)
    frame = _add_categorical_column(
        frame, "category", ["low", "mid", "high"], seed=55
    )
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
def _load_regression_hard() -> DatasetConfig:
    """Hard regression - 20 features, 1000 samples, heavy nulls + noise cols.

    Every third feature (``f0``, ``f3``, ...) receives ~20 % nulls; two
    noise columns and a random ``region`` column are appended.
    """
    features, response = make_regression(
        n_samples=1000,
        n_features=20,
        n_informative=10,
        noise=25.0,
        random_state=42,
    )
    column_names = [f"f{idx}" for idx in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=column_names)
    frame["target"] = response
    every_third = [f"f{idx}" for idx in range(0, 20, 3)]
    frame = _inject_nulls(frame, columns=every_third, fraction=0.20, seed=77)
    # Numeric noise is drawn before categorical noise to keep the random
    # stream reproducible.
    noise_rng = np.random.RandomState(77)
    n_rows = len(frame)
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)
    frame = _add_categorical_column(
        frame, "region", ["north", "south", "east", "west"], seed=77
    )
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
| # -- House price (synthetic, realistic column names) ------------------------- | |
def _load_house_price() -> DatasetConfig:
    """Synthetic house-price dataset with realistic column names.

    All columns are drawn from ``RandomState(42)``. ``price`` is a linear
    combination of the numeric attributes plus Gaussian noise;
    ``neighborhood`` is generated independently and does not enter the
    price formula.
    """
    generator = np.random.RandomState(42)
    n_rows = 600

    # The order of the draws below fixes the random stream; do not reorder.
    sqft = generator.normal(1800, 400, n_rows).clip(600, 5000)
    bedrooms = generator.choice(
        [1, 2, 3, 4, 5], n_rows, p=[0.05, 0.20, 0.40, 0.25, 0.10]
    )
    bathrooms = generator.choice([1, 2, 3], n_rows, p=[0.25, 0.50, 0.25])
    age = generator.randint(0, 80, n_rows)
    garage = generator.choice([0, 1, 2], n_rows, p=[0.2, 0.5, 0.3])
    neighborhood = generator.choice(
        ["downtown", "suburb", "rural"], n_rows, p=[0.3, 0.5, 0.2]
    )
    price_noise = generator.normal(0, 25_000, n_rows)

    price = (
        50_000
        + 120 * sqft
        + 15_000 * bedrooms
        + 12_000 * bathrooms
        - 800 * age
        + 20_000 * garage
        + price_noise
    )

    columns = {
        "sqft": sqft.astype(int),
        "bedrooms": bedrooms,
        "bathrooms": bathrooms,
        "age": age,
        "garage": garage,
        "neighborhood": neighborhood,
        "price": price.round(0).astype(int),
    }
    frame = pd.DataFrame(columns)
    return DatasetConfig(
        df=frame,
        target_column="price",
        problem_type=ProblemType.REGRESSION,
    )
| # -- Diabetes (regression) --------------------------------------------------- | |
def _load_diabetes_easy() -> DatasetConfig:
    """Diabetes dataset - regression task on the clean sklearn dataset.

    The response is stored in a ``target`` column appended after the
    features.
    """
    raw = load_diabetes()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame["target"] = raw.target
    config = DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
    return config
def _load_diabetes_medium() -> DatasetConfig:
    """Diabetes with ~12 % nulls in three columns + one categorical feature.

    The extra ``sex_group`` column is filled at random and carries no
    signal.
    """
    base = _load_diabetes_easy()
    with_nulls = _inject_nulls(
        base.df,
        columns=["bmi", "bp", "s5"],
        fraction=0.12,
        seed=123,
    )
    frame = _add_categorical_column(
        with_nulls, "sex_group", ["low", "normal", "high"], seed=123
    )
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
def _load_diabetes_hard() -> DatasetConfig:
    """Diabetes hard - ~22 % nulls in every feature + two noise columns."""
    base = _load_diabetes_easy()
    # "target" is the last column of the base frame, so this selects exactly
    # the columns before it.
    feature_names = [name for name in base.df.columns if name != "target"]
    frame = _inject_nulls(base.df, columns=feature_names, fraction=0.22, seed=456)
    noise_rng = np.random.RandomState(456)
    n_rows = len(frame)
    frame["noise1"] = noise_rng.randn(n_rows)
    frame["noise2"] = noise_rng.choice(["type_a", "type_b"], size=n_rows)
    return DatasetConfig(
        df=frame,
        target_column="target",
        problem_type=ProblemType.REGRESSION,
    )
| # -- Digits (multi-class classification) ------------------------------------- | |
def _load_digits_easy() -> DatasetConfig:
    """Handwritten digits - multi-class classification, 64 pixel features.

    Columns are named ``pixel_0`` .. ``pixel_63``; the label is stored in
    ``digit``.
    """
    raw = load_digits()
    pixel_names = [f"pixel_{idx}" for idx in range(64)]
    frame = pd.DataFrame(raw.data, columns=pixel_names)
    frame["digit"] = raw.target
    return DatasetConfig(
        df=frame,
        target_column="digit",
        problem_type=ProblemType.CLASSIFICATION,
    )
def _load_digits_medium() -> DatasetConfig:
    """Digits with light nulls (tests imputation on high-dim data).

    Every eighth pixel column (``pixel_0``, ``pixel_8``, ...) receives ~8 %
    nulls.
    """
    base = _load_digits_easy()
    sparse_pixels = [f"pixel_{idx}" for idx in range(0, 64, 8)]
    frame = _inject_nulls(base.df, columns=sparse_pixels, fraction=0.08, seed=42)
    return DatasetConfig(
        df=frame,
        target_column="digit",
        problem_type=ProblemType.CLASSIFICATION,
    )
| # -- Linnerud (real-world exercise physiology regression) --------------------- | |
def _load_linnerud_easy() -> DatasetConfig:
    """Linnerud - predict pulse from exercise and body measurements.

    ``load_linnerud`` exposes the exercise variables as ``bunch.data`` and
    the physiological variables as ``bunch.target``. ``Pulse`` becomes the
    modelling target (column ``pulse``); the remaining physiological columns
    are kept as features so the frame actually contains the body
    measurements (``Weight``, ``Waist``) that the medium variant injects
    nulls into. Previously those columns were dropped, so that injection was
    silently skipped.
    """
    bunch = load_linnerud()
    exercise = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    physio = pd.DataFrame(bunch.target, columns=bunch.target_names)
    # Everything physiological except the target is a body-measurement feature.
    body = physio.drop(columns=["Pulse"])
    df = pd.concat([exercise, body], axis=1)
    df["pulse"] = physio["Pulse"]
    return DatasetConfig(
        df=df,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )
def _load_linnerud_medium() -> DatasetConfig:
    """Linnerud with moderate missingness and one categorical context column.

    Nulls (~12 %) are requested for four columns; any of them that is not
    present in the base frame is skipped by ``_inject_nulls``. A random
    ``activity_group`` column is appended.
    """
    base = _load_linnerud_easy()
    null_targets = ["Chins", "Situps", "Weight", "Waist"]
    with_nulls = _inject_nulls(
        base.df, columns=null_targets, fraction=0.12, seed=551
    )
    levels = ["beginner", "intermediate", "advanced"]
    frame = _add_categorical_column(with_nulls, "activity_group", levels, seed=551)
    return DatasetConfig(
        df=frame,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )
def _load_linnerud_hard() -> DatasetConfig:
    """Linnerud hard mode with heavier nulls and distractor features.

    Every non-target column receives ~22 % nulls; ``noise_a`` (numeric) and
    ``noise_b`` (three-level string) are appended as distractors.
    """
    base = _load_linnerud_easy()
    feature_names = [name for name in base.df.columns if name != "pulse"]
    frame = _inject_nulls(base.df, columns=feature_names, fraction=0.22, seed=552)
    noise_rng = np.random.RandomState(552)
    n_rows = len(frame)
    frame["noise_a"] = noise_rng.randn(n_rows)
    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)
    return DatasetConfig(
        df=frame,
        target_column="pulse",
        problem_type=ProblemType.REGRESSION,
    )
| # ============================================================================ | |
| # Registry - (name, Difficulty | None) -> loader callable | |
| # ============================================================================ | |
# Registry key: dataset name plus a difficulty, or ``None`` for the
# difficulty-agnostic fallback entry used by ``load_dataset``.
_RegistryKey = Tuple[str, Optional[Difficulty]]
# Maps each key to a zero-argument loader. Insertion order is significant:
# ``list_datasets`` reports names in the order they first appear here.
_REGISTRY: Dict[_RegistryKey, Callable[[], DatasetConfig]] = {
    # -- Classification ------------------------------------------------------
    ("breast_cancer", Difficulty.EASY): _load_breast_cancer_easy,
    ("breast_cancer", Difficulty.MEDIUM): _load_breast_cancer_medium,
    ("breast_cancer", Difficulty.HARD): _load_breast_cancer_hard,
    ("breast_cancer", None): _load_breast_cancer_easy,
    ("iris", Difficulty.EASY): _load_iris_easy,
    ("iris", None): _load_iris_easy,
    ("wine", Difficulty.EASY): _load_wine_easy,
    ("wine", Difficulty.MEDIUM): _load_wine_medium,
    ("wine", None): _load_wine_easy,
    ("synth_cls", Difficulty.EASY): _load_synth_cls_easy,
    ("synth_cls", Difficulty.HARD): _load_synth_cls_hard,
    ("synth_cls", None): _load_synth_cls_easy,
    # NOTE: the diabetes loaders are regression tasks even though they sit
    # under the classification header; they are left in place so the
    # iteration order of the registry does not change.
    ("diabetes", Difficulty.EASY): _load_diabetes_easy,
    ("diabetes", Difficulty.MEDIUM): _load_diabetes_medium,
    ("diabetes", Difficulty.HARD): _load_diabetes_hard,
    ("diabetes", None): _load_diabetes_easy,
    ("digits", Difficulty.EASY): _load_digits_easy,
    ("digits", Difficulty.MEDIUM): _load_digits_medium,
    ("digits", None): _load_digits_easy,
    # -- Regression ----------------------------------------------------------
    ("regression", Difficulty.EASY): _load_regression_easy,
    ("regression", Difficulty.MEDIUM): _load_regression_medium,
    ("regression", Difficulty.HARD): _load_regression_hard,
    ("regression", None): _load_regression_easy,
    # house_price has a single variant, registered for EASY and as fallback.
    ("house_price", Difficulty.EASY): _load_house_price,
    ("house_price", None): _load_house_price,
    ("linnerud", Difficulty.EASY): _load_linnerud_easy,
    ("linnerud", Difficulty.MEDIUM): _load_linnerud_medium,
    ("linnerud", Difficulty.HARD): _load_linnerud_hard,
    ("linnerud", None): _load_linnerud_easy,
}