Spaces:

simeetnayan
/

odse

Running

App Files Files Community

odse / core /data /datasets.py

simeetnayan

Upload folder using huggingface_hub

4e680fd verified 28 days ago

raw

history blame contribute delete

19.6 kB

	"""Dataset registry for the ODSE sandbox environment.

	All datasets are generated in-process using ``sklearn.datasets`` and
	plain ``numpy`` / ``pandas`` - no network downloads required.

	Each dataset bundles a DataFrame with metadata (problem type, target column,
	feature columns). Datasets are keyed by ``(name, difficulty)`` and loaded
	lazily via factory functions.

	Adding a new dataset
	--------------------
	1. Write a loader function that returns ``DatasetConfig``.
	2. Add an entry to ``_REGISTRY`` at the bottom of this file.
	"""

	from __future__ import annotations

	from typing import Callable, Dict, List, Optional, Tuple

	import numpy as np
	import pandas as pd
	from sklearn.datasets import (
	load_breast_cancer,
	load_iris,
	load_wine,
	load_diabetes,
	load_digits,
	load_linnerud,
	make_classification,
	make_regression,
	)

	from ..models import Difficulty, ProblemType

	# ============================================================================
	# DatasetConfig
	# ============================================================================

	class DatasetConfig:
	    """A DataFrame paired with the metadata needed to model it.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        The raw dataset; stored by reference, not copied.
	    target_column : str
	        Name of the column to predict.
	    problem_type : ProblemType
	        Kind of modelling task; stored as given.
	    problem_description : str
	        Human-readable objective; defaults to an empty string.
	    feature_columns : List[str] | None
	        Explicit feature list. When omitted (or empty), every column of
	        ``df`` other than the target and the excluded ones is used, in
	        ``df`` column order.
	    exclude_columns : List[str] | None
	        Columns to keep out of the derived feature list (IDs, free
	        text, ...). Stored as an empty list when omitted.
	    """

	    def __init__(
	        self,
	        df: pd.DataFrame,
	        target_column: str,
	        problem_type: ProblemType,
	        problem_description: str = "",
	        feature_columns: Optional[List[str]] = None,
	        exclude_columns: Optional[List[str]] = None,
	    ) -> None:
	        self.df = df
	        self.target_column = target_column
	        self.problem_type = problem_type
	        self.problem_description = problem_description

	        # A falsy value (None or an empty list) becomes a fresh empty list.
	        self.exclude_columns = exclude_columns if exclude_columns else []

	        selected: List[str]
	        if feature_columns:
	            # Caller supplied a non-empty explicit list: use it untouched.
	            selected = feature_columns
	        else:
	            # Derive the features: everything except target and exclusions.
	            selected = []
	            for column in df.columns:
	                if column == target_column or column in self.exclude_columns:
	                    continue
	                selected.append(column)
	        self.feature_columns = selected

	# ============================================================================
	# Public API
	# ============================================================================

	def load_dataset(
	    name: str,
	    difficulty: Difficulty | str = Difficulty.EASY,
	) -> DatasetConfig:
	    """Load a dataset by name and difficulty.

	    Parameters
	    ----------
	    name : str
	        Registry name of the dataset.
	    difficulty : Difficulty | str
	        Requested difficulty. A string is converted with
	        ``Difficulty(difficulty)``; whatever that conversion raises for an
	        unrecognised value propagates to the caller.

	    Returns
	    -------
	    DatasetConfig
	        A freshly built config. If the loader left ``problem_description``
	        empty, it is filled in from ``_default_problem_description``.

	    Raises
	    ------
	    ValueError
	        If neither ``(name, difficulty)`` nor the difficulty-agnostic
	        ``(name, None)`` entry is registered.
	    """
	    if isinstance(difficulty, str):
	        difficulty = Difficulty(difficulty)

	    loader = _REGISTRY.get((name, difficulty))
	    if loader is None:
	        # Fall back to the difficulty-agnostic entry.
	        loader = _REGISTRY.get((name, None))

	    if loader is None:
	        # Every dataset is registered under several keys (one per
	        # difficulty plus the fallback); collapse them so each name is
	        # listed once in the error message.
	        available = sorted({registered for registered, _ in _REGISTRY})
	        raise ValueError(
	            f"Unknown dataset '{name}'. Available: {available}"
	        )

	    cfg = loader()
	    if not cfg.problem_description:
	        cfg.problem_description = _default_problem_description(
	            name, cfg.problem_type
	        )
	    return cfg

	def list_datasets() -> List[Dict[str, object]]:
	    """Return a summary of all registered datasets.

	    Returns
	    -------
	    List[Dict[str, object]]
	        One dict per dataset name, in registry order, with two keys:
	        ``"name"`` (the dataset name) and ``"difficulties"`` (a sorted list
	        of the ``.value`` of every non-``None`` difficulty registered for
	        that name). A name registered only under ``None`` gets an empty
	        list.
	    """
	    levels_by_name: Dict[str, List[str]] = {}
	    for name, diff in _REGISTRY:
	        levels = levels_by_name.setdefault(name, [])
	        # The None key is the difficulty-agnostic fallback, not a level.
	        if diff is not None:
	            levels.append(diff.value)
	    return [
	        {"name": name, "difficulties": sorted(levels)}
	        for name, levels in levels_by_name.items()
	    ]


	def _default_problem_description(name: str, problem_type: ProblemType) -> str:
	    """Return the default objective text for the dataset called ``name``.

	    Known dataset names map to a domain-specific sentence. Any other name
	    gets a generic sentence chosen by ``problem_type``: one wording for
	    ``ProblemType.REGRESSION`` and another for everything else.
	    """
	    # Generic wording used when ``name`` has no dedicated entry.
	    if problem_type == ProblemType.REGRESSION:
	        generic = "Predict the target column from available features."
	    else:
	        generic = "Classify each example into the correct target class."

	    known: Dict[str, str] = dict(
	        breast_cancer=(
	            "Predict whether a tumor is malignant or benign from cell-nuclei measurements."
	        ),
	        iris=(
	            "Classify iris flowers into species using sepal and petal measurements."
	        ),
	        wine="Predict wine cultivar class from physicochemical properties.",
	        synth_cls="Predict the class label from synthetic tabular features.",
	        regression=(
	            "Predict a continuous target value from synthetic tabular features."
	        ),
	        house_price=(
	            "Estimate house sale price from property attributes and neighborhood context."
	        ),
	        diabetes=(
	            "Predict quantitative diabetes progression from baseline clinical measurements."
	        ),
	        digits=(
	            "Classify handwritten digit images based on pixel-intensity features."
	        ),
	        linnerud="Predict pulse rate from physiological exercise measurements.",
	    )
	    return known.get(name, generic)

	# ============================================================================
	# Helpers
	# ============================================================================

	def _inject_nulls(
	df: pd.DataFrame,
	columns: List[str],
	fraction: float,
	seed: int,
	) -> pd.DataFrame:
	"""Inject NaN into columns at the given fraction."""
	rng = np.random.RandomState(seed)
	df = df.copy()
	for col in columns:
	if col in df.columns:
	mask = rng.rand(len(df)) < fraction
	df.loc[mask, col] = np.nan
	return df

	def _add_categorical_column(
	df: pd.DataFrame,
	col_name: str,
	categories: List[str],
	seed: int,
	) -> pd.DataFrame:
	"""Add a random categorical column to df."""
	rng = np.random.RandomState(seed)
	df = df.copy()
	df[col_name] = rng.choice(categories, size=len(df))
	return df

	# ============================================================================
	# Dataset Loaders - all offline, no network required
	# ============================================================================

	# -- Breast Cancer (binary classification) -----------------------------------

	def _load_breast_cancer_easy() -> DatasetConfig:
	    """Return the unmodified scikit-learn breast-cancer data as a binary task.

	    Feature columns keep the names supplied by scikit-learn; the label is
	    stored in a ``target`` column appended last.
	    """
	    raw = load_breast_cancer()
	    frame = pd.DataFrame(data=raw.data, columns=raw.feature_names)
	    frame["target"] = raw.target
	    return DatasetConfig(
	        frame,
	        target_column="target",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	def _load_breast_cancer_medium() -> DatasetConfig:
	    """Return breast-cancer data with ~15 % nulls in four size features.

	    Starts from the easy variant and blanks cells in the mean radius,
	    texture, perimeter and area columns (seed 123).
	    """
	    clean = _load_breast_cancer_easy().df
	    corrupted_columns = [
	        "mean radius",
	        "mean texture",
	        "mean perimeter",
	        "mean area",
	    ]
	    with_gaps = _inject_nulls(
	        clean, columns=corrupted_columns, fraction=0.15, seed=123
	    )
	    return DatasetConfig(
	        with_gaps,
	        target_column="target",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	def _load_breast_cancer_hard() -> DatasetConfig:
	    """Return breast-cancer data with ~25 % nulls plus two noise columns.

	    Nulls are injected into every column except ``target`` (seed 456).
	    ``noise_a`` (standard-normal floats) and ``noise_b`` (random letters
	    x/y/z) are then appended from a separate generator, also seeded 456.
	    """
	    clean = _load_breast_cancer_easy().df
	    feature_names = [name for name in clean.columns if name != "target"]
	    noisy = _inject_nulls(clean, columns=feature_names, fraction=0.25, seed=456)

	    # Draw order (randn, then choice) determines the generated values.
	    noise_rng = np.random.RandomState(456)
	    n_rows = len(noisy)
	    noisy["noise_a"] = noise_rng.randn(n_rows)
	    noisy["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)

	    return DatasetConfig(
	        noisy,
	        target_column="target",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	# -- Iris (multi-class classification) ---------------------------------------

	def _load_iris_easy() -> DatasetConfig:
	    """Return the scikit-learn iris data as a clean classification task.

	    The label is stored in a categorical ``species`` column built from
	    the integer class codes and the class names shipped with the data.
	    """
	    raw = load_iris()
	    frame = pd.DataFrame(data=raw.data, columns=raw.feature_names)
	    species = pd.Categorical.from_codes(raw.target, raw.target_names)
	    frame["species"] = species
	    return DatasetConfig(
	        frame,
	        target_column="species",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	# -- Wine (multi-class classification) ---------------------------------------

	def _load_wine_easy() -> DatasetConfig:
	    """Return the unmodified scikit-learn wine data as a classification task.

	    Feature columns keep the scikit-learn names; the class label is
	    stored in a ``quality_class`` column appended last.
	    """
	    raw = load_wine()
	    frame = pd.DataFrame(data=raw.data, columns=raw.feature_names)
	    frame["quality_class"] = raw.target
	    return DatasetConfig(
	        frame,
	        target_column="quality_class",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	def _load_wine_medium() -> DatasetConfig:
	    """Return wine data with ~20 % nulls and a random ``region`` column.

	    Nulls go into ``alcohol``, ``ash`` and ``magnesium``; ``region`` is
	    sampled from four compass directions. Both steps use seed 321.
	    """
	    clean = _load_wine_easy().df
	    with_gaps = _inject_nulls(
	        clean,
	        columns=["alcohol", "ash", "magnesium"],
	        fraction=0.20,
	        seed=321,
	    )
	    with_region = _add_categorical_column(
	        with_gaps,
	        "region",
	        ["north", "south", "east", "west"],
	        seed=321,
	    )
	    return DatasetConfig(
	        with_region,
	        target_column="quality_class",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	# -- Synthetic classification (scalable) -------------------------------------

	def _load_synth_cls_easy() -> DatasetConfig:
	    """Return a clean synthetic binary task: 500 samples, 10 features.

	    Features are named ``f0`` ... ``f9``; the label is in ``target``.
	    """
	    generator_args = dict(
	        n_samples=500,
	        n_features=10,
	        n_informative=6,
	        n_redundant=2,
	        n_classes=2,
	        random_state=42,
	    )
	    features, labels = make_classification(**generator_args)
	    names = [f"f{idx}" for idx in range(features.shape[1])]
	    frame = pd.DataFrame(features, columns=names)
	    frame["target"] = labels
	    return DatasetConfig(
	        frame,
	        target_column="target",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	def _load_synth_cls_hard() -> DatasetConfig:
	    """Return a synthetic 4-class task: 1000 samples, 20 features, corrupted.

	    After generation, ~15 % nulls are injected into ``f0``, ``f3``,
	    ``f7`` and ``f12`` and a random ``group`` column (A/B/C) is added.
	    Both corruption steps use seed 99.
	    """
	    generator_args = dict(
	        n_samples=1000,
	        n_features=20,
	        n_informative=10,
	        n_redundant=4,
	        n_classes=4,
	        n_clusters_per_class=2,
	        random_state=42,
	    )
	    features, labels = make_classification(**generator_args)
	    names = [f"f{idx}" for idx in range(features.shape[1])]
	    frame = pd.DataFrame(features, columns=names)
	    frame["target"] = labels

	    frame = _inject_nulls(
	        frame, columns=["f0", "f3", "f7", "f12"], fraction=0.15, seed=99
	    )
	    frame = _add_categorical_column(frame, "group", ["A", "B", "C"], seed=99)
	    return DatasetConfig(
	        frame,
	        target_column="target",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	# -- Regression (make_regression based) --------------------------------------

	def _load_regression_easy() -> DatasetConfig:
	    """Return a clean synthetic regression task: 400 samples, 8 features.

	    Features are named ``f0`` ... ``f7``; the response is in ``target``.
	    """
	    generator_args = dict(
	        n_samples=400,
	        n_features=8,
	        n_informative=5,
	        noise=10.0,
	        random_state=42,
	    )
	    features, response = make_regression(**generator_args)
	    names = [f"f{idx}" for idx in range(features.shape[1])]
	    frame = pd.DataFrame(features, columns=names)
	    frame["target"] = response
	    return DatasetConfig(
	        frame,
	        target_column="target",
	        problem_type=ProblemType.REGRESSION,
	    )

	def _load_regression_medium() -> DatasetConfig:
	    """Return a synthetic regression task: 600 samples, 12 features, some nulls.

	    ~10 % nulls are injected into ``f1``, ``f4`` and ``f8`` and a random
	    ``category`` column (low/mid/high) is added. Both steps use seed 55.
	    """
	    generator_args = dict(
	        n_samples=600,
	        n_features=12,
	        n_informative=7,
	        noise=15.0,
	        random_state=42,
	    )
	    features, response = make_regression(**generator_args)
	    names = [f"f{idx}" for idx in range(features.shape[1])]
	    frame = pd.DataFrame(features, columns=names)
	    frame["target"] = response

	    frame = _inject_nulls(
	        frame, columns=["f1", "f4", "f8"], fraction=0.10, seed=55
	    )
	    frame = _add_categorical_column(
	        frame, "category", ["low", "mid", "high"], seed=55
	    )
	    return DatasetConfig(
	        frame,
	        target_column="target",
	        problem_type=ProblemType.REGRESSION,
	    )

	def _load_regression_hard() -> DatasetConfig:
	    """Return a synthetic regression task: 1000 samples, 20 features, corrupted.

	    ~20 % nulls are injected into every third feature (``f0``, ``f3``,
	    ..., ``f18``). Two noise columns (``noise_a`` numeric, ``noise_b``
	    x/y/z) and a random ``region`` column are then appended. All
	    corruption steps use seed 77.
	    """
	    generator_args = dict(
	        n_samples=1000,
	        n_features=20,
	        n_informative=10,
	        noise=25.0,
	        random_state=42,
	    )
	    features, response = make_regression(**generator_args)
	    names = [f"f{idx}" for idx in range(features.shape[1])]
	    frame = pd.DataFrame(features, columns=names)
	    frame["target"] = response

	    every_third = [f"f{idx}" for idx in range(20) if idx % 3 == 0]
	    frame = _inject_nulls(frame, columns=every_third, fraction=0.20, seed=77)

	    # Draw order (randn, then choice) determines the generated values.
	    noise_rng = np.random.RandomState(77)
	    n_rows = len(frame)
	    frame["noise_a"] = noise_rng.randn(n_rows)
	    frame["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)

	    frame = _add_categorical_column(
	        frame, "region", ["north", "south", "east", "west"], seed=77
	    )
	    return DatasetConfig(
	        frame,
	        target_column="target",
	        problem_type=ProblemType.REGRESSION,
	    )

	# -- House price (synthetic, realistic column names) -------------------------

	def _load_house_price() -> DatasetConfig:
	    """Return a 600-row synthetic house-price regression dataset.

	    All columns come from one ``RandomState(42)``. ``price`` is a linear
	    combination of sqft, bedrooms, bathrooms, age and garage plus
	    Gaussian noise; ``neighborhood`` is sampled but does not enter the
	    price formula. ``sqft`` is stored truncated to int, while the price
	    is computed from the untruncated value.
	    """
	    n_rows = 600
	    gen = np.random.RandomState(42)

	    # Every draw advances the same generator, so this order is what
	    # fixes the generated values; keep it when editing.
	    living_area = gen.normal(1800, 400, n_rows).clip(600, 5000)
	    n_bed = gen.choice([1, 2, 3, 4, 5], n_rows, p=[0.05, 0.20, 0.40, 0.25, 0.10])
	    n_bath = gen.choice([1, 2, 3], n_rows, p=[0.25, 0.50, 0.25])
	    years_old = gen.randint(0, 80, n_rows)
	    garage_slots = gen.choice([0, 1, 2], n_rows, p=[0.2, 0.5, 0.3])
	    area_kind = gen.choice(["downtown", "suburb", "rural"], n_rows, p=[0.3, 0.5, 0.2])
	    price_noise = gen.normal(0, 25_000, n_rows)

	    sale_price = (
	        50_000
	        + 120 * living_area
	        + 15_000 * n_bed
	        + 12_000 * n_bath
	        - 800 * years_old
	        + 20_000 * garage_slots
	        + price_noise
	    )

	    column_data = {
	        "sqft": living_area.astype(int),
	        "bedrooms": n_bed,
	        "bathrooms": n_bath,
	        "age": years_old,
	        "garage": garage_slots,
	        "neighborhood": area_kind,
	        "price": sale_price.round(0).astype(int),
	    }
	    frame = pd.DataFrame(column_data)
	    return DatasetConfig(
	        frame,
	        target_column="price",
	        problem_type=ProblemType.REGRESSION,
	    )

	# -- Diabetes (regression) ---------------------------------------------------
	def _load_diabetes_easy() -> DatasetConfig:
	    """Return the unmodified scikit-learn diabetes data as a regression task.

	    Feature columns keep the scikit-learn names; the response is stored
	    in a ``target`` column appended last.
	    """
	    raw = load_diabetes()
	    frame = pd.DataFrame(data=raw.data, columns=raw.feature_names)
	    frame["target"] = raw.target
	    return DatasetConfig(
	        frame,
	        target_column="target",
	        problem_type=ProblemType.REGRESSION,
	    )

	def _load_diabetes_medium() -> DatasetConfig:
	    """Return diabetes data with ~12 % nulls and a random ``sex_group`` column.

	    Nulls go into ``bmi``, ``bp`` and ``s5``; ``sex_group`` is sampled
	    from low/normal/high. Both steps use seed 123.
	    """
	    clean = _load_diabetes_easy().df
	    with_gaps = _inject_nulls(
	        clean, columns=["bmi", "bp", "s5"], fraction=0.12, seed=123
	    )
	    with_group = _add_categorical_column(
	        with_gaps, "sex_group", ["low", "normal", "high"], seed=123
	    )
	    return DatasetConfig(
	        with_group,
	        target_column="target",
	        problem_type=ProblemType.REGRESSION,
	    )

	def _load_diabetes_hard() -> DatasetConfig:
	    """Return diabetes data with ~22 % nulls plus two noise columns.

	    Nulls are injected into every column except the last one of the easy
	    frame (seed 456). ``noise1`` (standard-normal floats) and ``noise2``
	    (type_a/type_b) are appended from a separate generator, also seeded
	    456.
	    """
	    clean = _load_diabetes_easy().df
	    # All columns but the last, which is where the easy loader puts the target.
	    all_but_last = clean.columns[:-1].tolist()
	    noisy = _inject_nulls(clean, columns=all_but_last, fraction=0.22, seed=456)

	    # Draw order (randn, then choice) determines the generated values.
	    noise_rng = np.random.RandomState(456)
	    n_rows = len(noisy)
	    noisy["noise1"] = noise_rng.randn(n_rows)
	    noisy["noise2"] = noise_rng.choice(["type_a", "type_b"], size=n_rows)

	    return DatasetConfig(
	        noisy,
	        target_column="target",
	        problem_type=ProblemType.REGRESSION,
	    )


	# -- Digits (multi-class classification) -------------------------------------
	def _load_digits_easy() -> DatasetConfig:
	    """Return the scikit-learn digits data as a clean classification task.

	    The 64 feature columns are named ``pixel_0`` ... ``pixel_63``; the
	    label is stored in a ``digit`` column appended last.
	    """
	    raw = load_digits()
	    pixel_names = [f"pixel_{idx}" for idx in range(64)]
	    frame = pd.DataFrame(data=raw.data, columns=pixel_names)
	    frame["digit"] = raw.target
	    return DatasetConfig(
	        frame,
	        target_column="digit",
	        problem_type=ProblemType.CLASSIFICATION,
	    )

	def _load_digits_medium() -> DatasetConfig:
	    """Return digits data with ~8 % nulls in every eighth pixel column.

	    The affected columns are ``pixel_0``, ``pixel_8``, ..., ``pixel_56``
	    (seed 42); intended to exercise imputation on high-dimensional data.
	    """
	    clean = _load_digits_easy().df
	    every_eighth = [f"pixel_{8 * step}" for step in range(8)]
	    with_gaps = _inject_nulls(
	        clean, columns=every_eighth, fraction=0.08, seed=42
	    )
	    return DatasetConfig(
	        with_gaps,
	        target_column="digit",
	        problem_type=ProblemType.CLASSIFICATION,
	    )


	# -- Linnerud (real-world exercise physiology regression) ---------------------
	def _load_linnerud_easy() -> DatasetConfig:
	    """Linnerud - predict pulse from exercise and body measurements.

	    scikit-learn's Linnerud bunch splits its six variables in two:
	    ``bunch.data`` holds the exercise counts (Chins, Situps, Jumps) and
	    ``bunch.target`` holds the physiological measurements (Weight,
	    Waist, Pulse). Building the frame from ``bunch.data`` alone therefore
	    leaves out the body measurements this docstring promises, and makes
	    any null injection aimed at ``Weight`` / ``Waist`` a silent no-op.

	    The returned frame carries the three exercise columns, then
	    ``Weight`` and ``Waist`` as additional features, and finally the
	    ``pulse`` target.
	    """
	    bunch = load_linnerud()
	    df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
	    physiological = pd.DataFrame(bunch.target, columns=bunch.target_names)

	    # Weight and Waist are predictors here even though scikit-learn files
	    # them under ``target``; only Pulse is the quantity being predicted.
	    for body_measure in ("Weight", "Waist"):
	        df[body_measure] = physiological[body_measure]
	    df["pulse"] = physiological["Pulse"]

	    return DatasetConfig(
	        df=df,
	        target_column="pulse",
	        problem_type=ProblemType.REGRESSION,
	    )


	def _load_linnerud_medium() -> DatasetConfig:
	    """Return Linnerud data with ~12 % nulls and a random ``activity_group``.

	    Null injection is requested for ``Chins``, ``Situps``, ``Weight`` and
	    ``Waist``; ``_inject_nulls`` skips any of these that the easy frame
	    does not contain. ``activity_group`` is sampled from
	    beginner/intermediate/advanced. Both steps use seed 551.
	    """
	    clean = _load_linnerud_easy().df
	    requested = ["Chins", "Situps", "Weight", "Waist"]
	    with_gaps = _inject_nulls(clean, columns=requested, fraction=0.12, seed=551)
	    with_group = _add_categorical_column(
	        with_gaps,
	        "activity_group",
	        ["beginner", "intermediate", "advanced"],
	        seed=551,
	    )
	    return DatasetConfig(
	        with_group,
	        target_column="pulse",
	        problem_type=ProblemType.REGRESSION,
	    )


	def _load_linnerud_hard() -> DatasetConfig:
	    """Return Linnerud data with ~22 % nulls plus two distractor columns.

	    Nulls are injected into every column except ``pulse`` (seed 552).
	    ``noise_a`` (standard-normal floats) and ``noise_b`` (x/y/z) are
	    appended from a separate generator, also seeded 552.
	    """
	    clean = _load_linnerud_easy().df
	    feature_names = [name for name in clean.columns if name != "pulse"]
	    noisy = _inject_nulls(clean, columns=feature_names, fraction=0.22, seed=552)

	    # Draw order (randn, then choice) determines the generated values.
	    noise_rng = np.random.RandomState(552)
	    n_rows = len(noisy)
	    noisy["noise_a"] = noise_rng.randn(n_rows)
	    noisy["noise_b"] = noise_rng.choice(["x", "y", "z"], size=n_rows)

	    return DatasetConfig(
	        noisy,
	        target_column="pulse",
	        problem_type=ProblemType.REGRESSION,
	    )

	# ============================================================================
	# Registry - (name, Difficulty | None) -> loader callable
	# ============================================================================

	# Registry key: dataset name plus a difficulty, or None for the
	# difficulty-agnostic fallback entry that load_dataset tries second.
	_RegistryKey = Tuple[str, Optional[Difficulty]]

	# Maps each key to a zero-argument loader that builds a fresh DatasetConfig.
	# Insertion order is observable: list_datasets() reports names in the order
	# they first appear here.
	_REGISTRY: Dict[_RegistryKey, Callable[[], DatasetConfig]] = {
	# -- Classification ------------------------------------------------------
	("breast_cancer", Difficulty.EASY): _load_breast_cancer_easy,
	("breast_cancer", Difficulty.MEDIUM): _load_breast_cancer_medium,
	("breast_cancer", Difficulty.HARD): _load_breast_cancer_hard,
	("breast_cancer", None): _load_breast_cancer_easy,
	("iris", Difficulty.EASY): _load_iris_easy,
	("iris", None): _load_iris_easy,
	("wine", Difficulty.EASY): _load_wine_easy,
	("wine", Difficulty.MEDIUM): _load_wine_medium,
	("wine", None): _load_wine_easy,
	("synth_cls", Difficulty.EASY): _load_synth_cls_easy,
	("synth_cls", Difficulty.HARD): _load_synth_cls_hard,
	("synth_cls", None): _load_synth_cls_easy,
	# -- Regression (the diabetes loaders declare ProblemType.REGRESSION) ----
	("diabetes", Difficulty.EASY): _load_diabetes_easy,
	("diabetes", Difficulty.MEDIUM): _load_diabetes_medium,
	("diabetes", Difficulty.HARD): _load_diabetes_hard,
	("diabetes", None): _load_diabetes_easy,
	# -- Classification (continued) ------------------------------------------
	("digits", Difficulty.EASY): _load_digits_easy,
	("digits", Difficulty.MEDIUM): _load_digits_medium,
	("digits", None): _load_digits_easy,
	# -- Regression ----------------------------------------------------------
	("regression", Difficulty.EASY): _load_regression_easy,
	("regression", Difficulty.MEDIUM): _load_regression_medium,
	("regression", Difficulty.HARD): _load_regression_hard,
	("regression", None): _load_regression_easy,
	# house_price has a single variant, reused for EASY and the fallback.
	("house_price", Difficulty.EASY): _load_house_price,
	("house_price", None): _load_house_price,
	("linnerud", Difficulty.EASY): _load_linnerud_easy,
	("linnerud", Difficulty.MEDIUM): _load_linnerud_medium,
	("linnerud", Difficulty.HARD): _load_linnerud_hard,
	("linnerud", None): _load_linnerud_easy,
	}