ML_course / data /datasets.py
livieris's picture
Upload 15 files
be64da1 verified
"""
data/datasets.py
All dataset generation and loading for the ML course.
Each model page imports from here.
"""
import numpy as np
from sklearn.datasets import (
load_diabetes, load_wine, load_linnerud,
load_iris, load_breast_cancer,
make_moons, make_circles, make_blobs,
)
from pydantic import BaseModel
from typing import Optional
# ── Pydantic schemas ──────────────────────────────────────────────────────────
# Pydantic schema holding the parameters of one synthetic regression dataset.
# Instances are read attribute-by-attribute by generate_synthetic().
class SyntheticConfig(BaseModel):
    # Selects the generator branch; any other value makes generate_synthetic()
    # fall back to a fixed linear relationship.
    dataset_type: str  # "linear" | "polynomial" | "sinusoidal" | "heteroscedastic"
    n_samples: int = 200  # number of points drawn
    noise: float = 0.3  # base standard deviation of the Gaussian noise term
    degree: int = 2  # polynomial: highest power in X + X**2 + ... + X**degree
    frequency: float = 1.0  # sinusoidal: multiplies X inside sin()
    amplitude: float = 1.0  # sinusoidal: multiplies the sin() term
    phase: float = 0.0  # sinusoidal phase shift
    slope: float = 2.0  # linear / heteroscedastic
    intercept: float = 1.0  # constant offset; only the "linear" branch adds it
    outlier_fraction: float = 0.0  # share of samples turned into outliers; 0 disables injection
    random_state: int = 42  # seed passed to np.random.RandomState
# Pydantic schema naming one of the bundled real regression datasets;
# the accepted names are the ones load_real_dataset() recognises.
class RealDatasetConfig(BaseModel):
    dataset_name: str  # "diabetes" | "wine_quality" | "linnerud"
# ── Synthetic generators ──────────────────────────────────────────────────────
# Synthetic regression dataset types that generate_synthetic() has a
# dedicated branch for.
SYNTHETIC_DATASETS = {
    "linear",
    "polynomial",
    "sinusoidal",
    "heteroscedastic",
}

# Display metadata for each real regression dataset, keyed by the name that
# load_real_dataset() accepts. All values are strings.
REAL_DATASETS = {
    "diabetes": dict(label="Diabetes", dims="10D", rows="442"),
    "wine_quality": dict(label="Wine", dims="13D", rows="178"),
    "linnerud": dict(label="Linnerud", dims="3D", rows="20"),
}
def generate_synthetic(cfg: SyntheticConfig):
    """Return ``(X, y)`` NumPy arrays for a synthetic 1-D regression problem.

    ``X`` is a sorted 1-D array of ``cfg.n_samples`` values drawn uniformly
    from [-3, 3]; ``y`` is a 1-D array of the same length, so each (x, y) pair
    is a point in the plane.

    The relationship is selected by ``cfg.dataset_type``:

    * ``"linear"``          -- ``slope * X + intercept`` plus Gaussian noise.
    * ``"polynomial"``      -- ``X + X**2 + ... + X**degree`` plus Gaussian
      noise with twice the base scale.
    * ``"sinusoidal"``      -- ``amplitude * sin(frequency * X + phase)`` plus
      Gaussian noise.
    * ``"heteroscedastic"`` -- ``slope * X`` plus Gaussian noise whose scale
      grows with ``|X|``.
    * anything else         -- fixed fallback ``2 * X`` with noise scale 0.3.

    When ``cfg.outlier_fraction`` is positive, that share of the samples (at
    least one, at most all of them) is shifted up or down by 4-8 standard
    deviations of ``y``.

    Results are reproducible for a given ``cfg.random_state``.
    """
    rng = np.random.RandomState(cfg.random_state)
    n = cfg.n_samples
    X = np.sort(rng.uniform(-3, 3, n))

    # rng.normal rejects a negative scale, so every configurable branch shares
    # the same floored base scale (the polynomial branch used to skip this).
    sigma = max(cfg.noise, 1e-6)

    if cfg.dataset_type == "linear":
        y = cfg.slope * X + cfg.intercept + rng.normal(0, sigma, n)
    elif cfg.dataset_type == "polynomial":
        y = sum(X ** i for i in range(1, cfg.degree + 1)) + rng.normal(0, sigma * 2, n)
    elif cfg.dataset_type == "sinusoidal":
        y = cfg.amplitude * np.sin(cfg.frequency * X + cfg.phase) + rng.normal(0, sigma, n)
    elif cfg.dataset_type == "heteroscedastic":
        # Per-sample scale: noise widens as |X| grows.
        y = cfg.slope * X + rng.normal(0, sigma * (1 + np.abs(X)), n)
    else:
        y = 2.0 * X + rng.normal(0, 0.3, n)

    # Inject outliers. Sampling without replacement cannot draw more indices
    # than there are samples, so the count is capped at n and the step is
    # skipped entirely for an empty dataset.
    if cfg.outlier_fraction > 0 and n > 0:
        n_out = min(n, max(1, int(n * cfg.outlier_fraction)))
        idx = rng.choice(n, n_out, replace=False)
        y[idx] += rng.choice([-1, 1], n_out) * rng.uniform(4, 8, n_out) * np.std(y)
    return X, y
def load_real_dataset(name: str):
    """Load a bundled scikit-learn dataset for regression.

    Returns ``(X, y, feature_names)`` where ``feature_names`` is a plain list.

    * ``"diabetes"``     -- ``load_diabetes`` data and target as-is.
    * ``"wine_quality"`` -- ``load_wine`` data; its target cast to float.
    * ``"linnerud"``     -- ``load_linnerud`` data; only target column 0 is
      used as ``y``.

    Raises ``ValueError`` for any other name.
    """
    # Reject unknown names first so the branches below only see valid keys.
    if name not in ("diabetes", "wine_quality", "linnerud"):
        raise ValueError(f"Unknown real dataset: '{name}'")

    if name == "diabetes":
        bunch = load_diabetes()
        target = bunch.target
    elif name == "wine_quality":
        bunch = load_wine()
        target = bunch.target.astype(float)
    else:
        bunch = load_linnerud()
        # predict Weight (index 0) from exercise features
        target = bunch.target[:, 0]

    return bunch.data, target, list(bunch.feature_names)
def get_dataset_info(name: str) -> dict:
    """Describe a real regression dataset.

    The result has a ``"features"`` entry (list of feature names from
    ``load_real_dataset``) followed by whatever metadata ``REAL_DATASETS``
    holds for ``name``. Unknown names propagate the loader's ``ValueError``.
    """
    _X, _y, names = load_real_dataset(name)
    info = {"features": names}
    info.update(REAL_DATASETS.get(name, {}))
    return info
# ── Classification schemas ────────────────────────────────────────────────────
# Pydantic schema holding the parameters of one synthetic classification
# dataset. Instances are read by generate_classification_synthetic().
class ClassificationSyntheticConfig(BaseModel):
    # Any value other than "moons" or "circles" is handled as "blobs" by
    # generate_classification_synthetic().
    dataset_type: str  # "moons" | "circles" | "blobs"
    n_samples: int = 300  # total number of points generated
    noise: float = 0.20  # moons / circles: forwarded as the generator's `noise`; unused for blobs
    n_centers: int = 3  # blobs only; the generator clamps it to the range 2..5
    factor: float = 0.5  # circles: inner/outer radius ratio (0.1–0.8); the generator clamps it to [0.05, 0.9]
    cluster_std: float = 1.0  # blobs: cluster standard deviation; the generator floors it at 0.2
    random_state: int = 42  # seed forwarded to the sklearn generator
# Pydantic schema naming one of the bundled real classification datasets;
# the accepted names are the ones load_classification_real() recognises.
class ClassificationRealConfig(BaseModel):
    dataset_name: str  # "iris" | "wine_clf" | "breast_cancer"
# Synthetic classification dataset types named by
# ClassificationSyntheticConfig.dataset_type.
CLASSIFICATION_SYNTHETIC_DATASETS = {
    "moons",
    "circles",
    "blobs",
}

# Display metadata for each real classification dataset, keyed by the name
# that load_classification_real() accepts. `n_classes` is an int; the other
# values are strings.
CLASSIFICATION_REAL_DATASETS = {
    "iris": dict(label="Iris", dims="4D", rows="150", n_classes=3),
    "wine_clf": dict(label="Wine", dims="13D", rows="178", n_classes=3),
    "breast_cancer": dict(label="Breast Cancer", dims="30D", rows="569", n_classes=2),
}
# ── Classification generators ─────────────────────────────────────────────────
def generate_classification_synthetic(cfg: ClassificationSyntheticConfig):
    """Return ``(X, y, class_names)`` for a 2-D synthetic classification dataset.

    ``X`` holds the 2-D points, ``y`` the integer class labels, and
    ``class_names[i]`` is the display name of label ``i``.

    * ``"moons"``   -- ``make_moons`` with ``cfg.noise``.
    * ``"circles"`` -- ``make_circles`` with ``cfg.noise`` and ``cfg.factor``
      clamped to [0.05, 0.9].
    * anything else -- ``make_blobs`` with ``cfg.n_centers`` clamped to 2..5
      and ``cfg.cluster_std`` floored at 0.2.
    """
    if cfg.dataset_type == "moons":
        X, y = make_moons(n_samples=cfg.n_samples, noise=cfg.noise,
                          random_state=cfg.random_state)
        class_names = ["Class 0", "Class 1"]
    elif cfg.dataset_type == "circles":
        factor = max(0.05, min(0.9, cfg.factor))
        X, y = make_circles(n_samples=cfg.n_samples, noise=cfg.noise,
                            factor=factor, random_state=cfg.random_state)
        # make_circles labels the outer circle 0 and the inner circle 1, so
        # the names must be listed in that order to line up with the labels.
        class_names = ["Outer", "Inner"]
    else:  # blobs
        centers = max(2, min(cfg.n_centers, 5))
        X, y = make_blobs(n_samples=cfg.n_samples, centers=centers,
                          cluster_std=max(0.2, cfg.cluster_std),
                          random_state=cfg.random_state)
        class_names = [f"Blob {i}" for i in range(centers)]
    return X, y.astype(int), class_names
def load_classification_real(name: str):
    """Load a bundled scikit-learn dataset for classification.

    Returns ``(X, y, feature_names, class_names)``; both name collections are
    plain lists. ``"iris"`` and ``"breast_cancer"`` take their class names
    from the dataset's ``target_names``; ``"wine_clf"`` uses the fixed labels
    "Cultivar 1" .. "Cultivar 3".

    Raises ``ValueError`` for any other name.
    """
    # Reject unknown names first so the branches below only see valid keys.
    if name not in ("iris", "wine_clf", "breast_cancer"):
        raise ValueError(f"Unknown classification dataset: '{name}'")

    if name == "wine_clf":
        bunch = load_wine()
        labels = [f"Cultivar {k}" for k in range(1, 4)]
    else:
        bunch = load_iris() if name == "iris" else load_breast_cancer()
        labels = list(bunch.target_names)

    return bunch.data, bunch.target, list(bunch.feature_names), labels
def get_classification_dataset_info(name: str) -> dict:
    """Describe a real classification dataset.

    The result has ``"features"`` and ``"classes"`` entries (from
    ``load_classification_real``) followed by whatever metadata
    ``CLASSIFICATION_REAL_DATASETS`` holds for ``name``. Unknown names
    propagate the loader's ``ValueError``.
    """
    _X, _y, features, classes = load_classification_real(name)
    info = {"features": features, "classes": classes}
    info.update(CLASSIFICATION_REAL_DATASETS.get(name, {}))
    return info