"""
data/datasets.py
All dataset generation and loading for the ML course.
Each model page imports from here.
"""

import numpy as np
from sklearn.datasets import (
    load_diabetes,
    load_wine,
    load_linnerud,
    load_iris,
    load_breast_cancer,
    make_moons,
    make_circles,
    make_blobs,
)
from pydantic import BaseModel
from typing import Optional


# ── Pydantic schemas ──────────────────────────────────────────────────────────

class SyntheticConfig(BaseModel):
    """Parameters controlling a 1-feature synthetic regression dataset."""

    dataset_type: str  # "linear" | "polynomial" | "sinusoidal" | "heteroscedastic"
    n_samples: int = 200
    noise: float = 0.3
    degree: int = 2  # polynomial
    frequency: float = 1.0  # sinusoidal
    amplitude: float = 1.0  # sinusoidal
    phase: float = 0.0  # sinusoidal phase shift
    slope: float = 2.0  # linear / heteroscedastic
    intercept: float = 1.0  # linear
    outlier_fraction: float = 0.0
    random_state: int = 42


class RealDatasetConfig(BaseModel):
    """Selects one of the bundled real regression datasets by name."""

    dataset_name: str  # "diabetes" | "wine_quality" | "linnerud"


# ── Synthetic generators ──────────────────────────────────────────────────────

SYNTHETIC_DATASETS = {"linear", "polynomial", "sinusoidal", "heteroscedastic"}

REAL_DATASETS = {
    "diabetes": {"label": "Diabetes", "dims": "10D", "rows": "442"},
    "wine_quality": {"label": "Wine", "dims": "13D", "rows": "178"},
    "linnerud": {"label": "Linnerud", "dims": "3D", "rows": "20"},
}


def generate_synthetic(cfg: SyntheticConfig):
    """Return (X_1d, y) numpy arrays for 2-D synthetic regression datasets.

    X is ``cfg.n_samples`` points drawn uniformly from [-3, 3] and sorted.
    y depends on ``cfg.dataset_type``:

    * ``"linear"``          -- slope * X + intercept + Gaussian noise
    * ``"polynomial"``      -- X + X**2 + ... + X**degree + Gaussian noise
      (noise scale doubled)
    * ``"sinusoidal"``      -- amplitude * sin(frequency * X + phase) + noise
    * ``"heteroscedastic"`` -- slope * X + noise whose scale grows with |X|
    * anything else         -- falls back to 2 * X + N(0, 0.3)

    When ``cfg.outlier_fraction > 0`` a random subset of y values is shifted
    by +/- 4..8 standard deviations of y.

    The result is deterministic for a given ``cfg.random_state``.
    """
    rng = np.random.RandomState(cfg.random_state)
    X = np.sort(rng.uniform(-3, 3, cfg.n_samples))

    # Floor the noise scale once so every branch (including polynomial, which
    # previously passed cfg.noise through unguarded) rejects neither zero nor
    # negative values: rng.normal raises on a negative scale.
    noise = max(cfg.noise, 1e-6)

    if cfg.dataset_type == "linear":
        y = cfg.slope * X + cfg.intercept + rng.normal(0, noise, cfg.n_samples)
    elif cfg.dataset_type == "polynomial":
        y = sum(X ** i for i in range(1, cfg.degree + 1)) + rng.normal(
            0, noise * 2, cfg.n_samples
        )
    elif cfg.dataset_type == "sinusoidal":
        y = cfg.amplitude * np.sin(cfg.frequency * X + cfg.phase) + rng.normal(
            0, noise, cfg.n_samples
        )
    elif cfg.dataset_type == "heteroscedastic":
        # Per-sample noise scale: larger |X| -> wider spread.
        y = cfg.slope * X + rng.normal(0, noise * (1 + np.abs(X)), cfg.n_samples)
    else:
        # Unknown type: deliberate best-effort fallback to a fixed linear model.
        y = 2.0 * X + rng.normal(0, 0.3, cfg.n_samples)

    # inject outliers
    if cfg.outlier_fraction > 0:
        # At least one outlier when requested, but never more than there are
        # samples: choice(..., replace=False) raises if asked for more items
        # than the population holds (fraction > 1, or n_samples == 0).
        n_out = min(cfg.n_samples, max(1, int(cfg.n_samples * cfg.outlier_fraction)))
        if n_out > 0:
            idx = rng.choice(cfg.n_samples, n_out, replace=False)
            y[idx] += rng.choice([-1, 1], n_out) * rng.uniform(4, 8, n_out) * np.std(y)
    return X, y


def load_real_dataset(name: str):
    """Return (X, y, feature_names) for bundled sklearn datasets.

    Supported names are "diabetes", "wine_quality" and "linnerud".

    Raises:
        ValueError: if ``name`` is not one of the supported datasets.
    """
    if name == "diabetes":
        ds = load_diabetes()
        return ds.data, ds.target, list(ds.feature_names)
    elif name == "wine_quality":
        # Backed by sklearn's load_wine; its integer target is cast to float
        # so it can be used as a regression target.
        ds = load_wine()
        return ds.data, ds.target.astype(float), list(ds.feature_names)
    elif name == "linnerud":
        ds = load_linnerud()
        # predict Weight (index 0) from exercise features
        return ds.data, ds.target[:, 0], list(ds.feature_names)
    else:
        raise ValueError(f"Unknown real dataset: '{name}'")


def get_dataset_info(name: str) -> dict:
    """Return feature names and metadata for a real dataset.

    The result has a "features" key plus whatever REAL_DATASETS stores for
    ``name`` (label, dims, rows). Unknown names raise ValueError via
    load_real_dataset.
    """
    _, _, feature_names = load_real_dataset(name)
    meta = REAL_DATASETS.get(name, {})
    return {"features": feature_names, **meta}


# ── Classification schemas ────────────────────────────────────────────────────

class ClassificationSyntheticConfig(BaseModel):
    """Parameters controlling a 2-feature synthetic classification dataset."""

    dataset_type: str  # "moons" | "circles" | "blobs"
    n_samples: int = 300
    noise: float = 0.20
    n_centers: int = 3  # blobs only
    factor: float = 0.5  # circles: inner/outer radius ratio (0.1–0.8)
    cluster_std: float = 1.0  # blobs: cluster standard deviation
    random_state: int = 42


class ClassificationRealConfig(BaseModel):
    """Selects one of the bundled real classification datasets by name."""

    dataset_name: str  # "iris" | "wine_clf" | "breast_cancer"


CLASSIFICATION_SYNTHETIC_DATASETS = {"moons", "circles", "blobs"}

CLASSIFICATION_REAL_DATASETS = {
    "iris": {"label": "Iris", "dims": "4D", "rows": "150", "n_classes": 3},
    "wine_clf": {"label": "Wine", "dims": "13D", "rows": "178", "n_classes": 3},
    "breast_cancer": {"label": "Breast Cancer", "dims": "30D", "rows": "569", "n_classes": 2},
}


# ── Classification generators
# ─────────────────────────────────────────────────

def generate_classification_synthetic(cfg: ClassificationSyntheticConfig):
    """Return (X, y, class_names) for 2-D classification datasets.

    ``cfg.dataset_type`` selects the generator: "moons", "circles", or
    (for any other value) blobs. ``class_names[i]`` is the display name of
    integer label ``i`` in ``y``.
    """
    if cfg.dataset_type == "moons":
        X, y = make_moons(
            n_samples=cfg.n_samples,
            noise=cfg.noise,
            random_state=cfg.random_state,
        )
        class_names = ["Class 0", "Class 1"]
    elif cfg.dataset_type == "circles":
        # Keep the inner/outer radius ratio strictly inside (0, 1).
        factor = max(0.05, min(0.9, cfg.factor))
        X, y = make_circles(
            n_samples=cfg.n_samples,
            noise=cfg.noise,
            factor=factor,
            random_state=cfg.random_state,
        )
        # make_circles assigns label 0 to the outer ring and label 1 to the
        # inner ring, so the names must be listed in that order.
        class_names = ["Outer", "Inner"]
    else:  # blobs
        # Clamp the number of clusters to 2..5 and the spread to >= 0.2.
        centers = max(2, min(cfg.n_centers, 5))
        X, y = make_blobs(
            n_samples=cfg.n_samples,
            centers=centers,
            cluster_std=max(0.2, cfg.cluster_std),
            random_state=cfg.random_state,
        )
        class_names = [f"Blob {i}" for i in range(centers)]
    return X, y.astype(int), class_names


def load_classification_real(name: str):
    """Return (X, y, feature_names, class_names) for classification datasets.

    Supported names are "iris", "wine_clf" and "breast_cancer".

    Raises:
        ValueError: if ``name`` is not one of the supported datasets.
    """
    if name == "iris":
        ds = load_iris()
        return ds.data, ds.target, list(ds.feature_names), list(ds.target_names)
    elif name == "wine_clf":
        ds = load_wine()
        # Use friendlier 1-based cultivar labels instead of ds.target_names.
        class_names = [f"Cultivar {i+1}" for i in range(3)]
        return ds.data, ds.target, list(ds.feature_names), class_names
    elif name == "breast_cancer":
        ds = load_breast_cancer()
        return ds.data, ds.target, list(ds.feature_names), list(ds.target_names)
    else:
        raise ValueError(f"Unknown classification dataset: '{name}'")


def get_classification_dataset_info(name: str) -> dict:
    """Return feature names, class names and metadata for a real dataset.

    The result has "features" and "classes" keys plus whatever
    CLASSIFICATION_REAL_DATASETS stores for ``name``. Unknown names raise
    ValueError via load_classification_real.
    """
    _, _, feature_names, class_names = load_classification_real(name)
    meta = CLASSIFICATION_REAL_DATASETS.get(name, {})
    return {"features": feature_names, "classes": class_names, **meta}