"""
data/datasets.py
All dataset generation and loading for the ML course.
Each model page imports from here.
"""
import numpy as np
from sklearn.datasets import (
load_diabetes, load_wine, load_linnerud,
load_iris, load_breast_cancer,
make_moons, make_circles, make_blobs,
)
from pydantic import BaseModel
from typing import Optional
# ── Pydantic schemas ──────────────────────────────────────────────────────────
class SyntheticConfig(BaseModel):
    """Settings describing one synthetic regression dataset.

    ``dataset_type`` picks the curve family; the remaining fields tune it.
    A field tagged with a dataset type is only read by ``generate_synthetic``
    for that type.
    """

    # Curve family: "linear" | "polynomial" | "sinusoidal" | "heteroscedastic"
    dataset_type: str
    # Number of points to generate
    n_samples: int = 200
    # Standard deviation of the Gaussian noise added to y
    noise: float = 0.3
    # polynomial: highest power of X included in the sum
    degree: int = 2
    # sinusoidal: multiplier applied to X inside sin()
    frequency: float = 1.0
    # sinusoidal: multiplier applied to the sine value
    amplitude: float = 1.0
    # sinusoidal: phase shift added inside sin()
    phase: float = 0.0
    # linear / heteroscedastic: slope of the underlying line
    slope: float = 2.0
    # linear: constant offset added to the line
    intercept: float = 1.0
    # Fraction of points turned into outliers (0 disables injection)
    outlier_fraction: float = 0.0
    # Seed for the random number generator
    random_state: int = 42
class RealDatasetConfig(BaseModel):
    """Selects one of the bundled real regression datasets by name."""

    # Dataset key: "diabetes" | "wine_quality" | "linnerud"
    dataset_name: str
# ── Synthetic generators ──────────────────────────────────────────────────────
# Names of the synthetic regression dataset types.
SYNTHETIC_DATASETS = {
    "linear",
    "polynomial",
    "sinusoidal",
    "heteroscedastic",
}

# Display metadata for each bundled real regression dataset, keyed by the
# dataset name. "dims" and "rows" are preformatted strings, not numbers.
REAL_DATASETS = {
    "diabetes": {
        "label": "Diabetes",
        "dims": "10D",
        "rows": "442",
    },
    "wine_quality": {
        "label": "Wine",
        "dims": "13D",
        "rows": "178",
    },
    "linnerud": {
        "label": "Linnerud",
        "dims": "3D",
        "rows": "20",
    },
}
def generate_synthetic(cfg: SyntheticConfig):
    """Return ``(X, y)`` 1-D numpy arrays for a synthetic regression dataset.

    ``X`` holds ``cfg.n_samples`` values drawn uniformly from [-3, 3] and
    sorted ascending. ``y`` is the curve selected by ``cfg.dataset_type``
    plus Gaussian noise. An unrecognised ``dataset_type`` falls back to a
    fixed line ``y = 2 * X`` with noise of standard deviation 0.3.

    When ``cfg.outlier_fraction`` is positive, at least one point (and at
    most every point) is shifted up or down by 4 to 8 standard deviations
    of ``y``.

    Args:
        cfg: Generation settings; see ``SyntheticConfig``.

    Returns:
        Tuple ``(X, y)`` of float arrays, each of length ``cfg.n_samples``.
    """
    n = cfg.n_samples
    rng = np.random.RandomState(cfg.random_state)
    X = np.sort(rng.uniform(-3, 3, n))

    # Floor the noise scale so a zero or negative setting never reaches
    # rng.normal as an invalid standard deviation.
    sigma = max(cfg.noise, 1e-6)

    if cfg.dataset_type == "linear":
        y = cfg.slope * X + cfg.intercept + rng.normal(0, sigma, n)
    elif cfg.dataset_type == "polynomial":
        # Sum of X**1 .. X**degree, every coefficient equal to 1.
        y = sum(X ** i for i in range(1, cfg.degree + 1)) + rng.normal(0, cfg.noise * 2, n)
    elif cfg.dataset_type == "sinusoidal":
        y = cfg.amplitude * np.sin(cfg.frequency * X + cfg.phase) + rng.normal(0, sigma, n)
    elif cfg.dataset_type == "heteroscedastic":
        # Noise standard deviation grows with |X|.
        y = cfg.slope * X + rng.normal(0, sigma * (1 + np.abs(X)), n)
    else:
        y = 2.0 * X + rng.normal(0, 0.3, n)

    # inject outliers
    if cfg.outlier_fraction > 0:
        # Sampling without replacement cannot pick more indices than there
        # are points, so cap the count at n (fractions above 1, or n == 0).
        n_out = min(n, max(1, int(n * cfg.outlier_fraction)))
        if n_out > 0:
            idx = rng.choice(n, n_out, replace=False)
            y[idx] += rng.choice([-1, 1], n_out) * rng.uniform(4, 8, n_out) * np.std(y)
    return X, y
def load_real_dataset(name: str):
    """Load a bundled sklearn dataset as a regression problem.

    Args:
        name: One of "diabetes", "wine_quality" or "linnerud".

    Returns:
        Tuple ``(X, y, feature_names)`` where ``feature_names`` is a list.

    Raises:
        ValueError: If ``name`` is not one of the supported datasets.
    """
    # Reject unknown names up front so the loaders below are never touched.
    if name not in ("diabetes", "wine_quality", "linnerud"):
        raise ValueError(f"Unknown real dataset: '{name}'")

    if name == "diabetes":
        bunch = load_diabetes()
        target = bunch.target
    elif name == "wine_quality":
        bunch = load_wine()
        # Labels are cast to float so they can serve as a regression target.
        target = bunch.target.astype(float)
    else:
        bunch = load_linnerud()
        # predict Weight (index 0) from exercise features
        target = bunch.target[:, 0]

    return bunch.data, target, list(bunch.feature_names)
def get_dataset_info(name: str) -> dict:
    """Describe a real regression dataset.

    Returns a dict with a "features" entry (the dataset's feature names)
    merged with the matching ``REAL_DATASETS`` metadata, if any. An unknown
    ``name`` raises ``ValueError`` from ``load_real_dataset``.
    """
    feature_names = load_real_dataset(name)[2]
    info = {"features": feature_names}
    # Metadata goes in after "features", matching the original key order.
    info.update(REAL_DATASETS.get(name, {}))
    return info
# ── Classification schemas ────────────────────────────────────────────────────
class ClassificationSyntheticConfig(BaseModel):
    """Settings describing one synthetic 2-D classification dataset.

    ``dataset_type`` picks the generator; fields tagged with a dataset type
    are only read by ``generate_classification_synthetic`` for that type.
    """

    # Generator: "moons" | "circles" | "blobs"
    dataset_type: str
    # Number of points to generate
    n_samples: int = 300
    # moons / circles: noise level passed to the sklearn generator
    noise: float = 0.20
    # blobs only: number of clusters
    n_centers: int = 3
    # circles: inner/outer radius ratio (0.1-0.8)
    factor: float = 0.5
    # blobs: cluster standard deviation
    cluster_std: float = 1.0
    # Seed passed to the sklearn generator
    random_state: int = 42
class ClassificationRealConfig(BaseModel):
    """Selects one of the bundled real classification datasets by name."""

    # Dataset key: "iris" | "wine_clf" | "breast_cancer"
    dataset_name: str
# Names of the synthetic classification dataset types.
CLASSIFICATION_SYNTHETIC_DATASETS = {
    "moons",
    "circles",
    "blobs",
}

# Display metadata for each bundled real classification dataset, keyed by the
# dataset name. "dims" and "rows" are preformatted strings; "n_classes" is an
# int.
CLASSIFICATION_REAL_DATASETS = {
    "iris": {
        "label": "Iris",
        "dims": "4D",
        "rows": "150",
        "n_classes": 3,
    },
    "wine_clf": {
        "label": "Wine",
        "dims": "13D",
        "rows": "178",
        "n_classes": 3,
    },
    "breast_cancer": {
        "label": "Breast Cancer",
        "dims": "30D",
        "rows": "569",
        "n_classes": 2,
    },
}
# ── Classification generators ─────────────────────────────────────────────────
def generate_classification_synthetic(cfg: ClassificationSyntheticConfig):
    """Build a synthetic 2-D classification dataset.

    Args:
        cfg: Generation settings. ``dataset_type`` "moons" and "circles"
            select those generators; any other value produces blobs.

    Returns:
        Tuple ``(X, y, class_names)`` with ``y`` cast to int and
        ``class_names`` a list of display labels, one per class.
    """
    kind = cfg.dataset_type
    seed = cfg.random_state

    if kind == "moons":
        points, labels = make_moons(
            n_samples=cfg.n_samples,
            noise=cfg.noise,
            random_state=seed,
        )
        return points, labels.astype(int), ["Class 0", "Class 1"]

    if kind == "circles":
        # Keep the radius ratio within [0.05, 0.9].
        ratio = max(0.05, min(0.9, cfg.factor))
        points, labels = make_circles(
            n_samples=cfg.n_samples,
            noise=cfg.noise,
            factor=ratio,
            random_state=seed,
        )
        return points, labels.astype(int), ["Inner", "Outer"]

    # blobs (also the fallback for any other dataset_type)
    n_blobs = max(2, min(cfg.n_centers, 5))  # between 2 and 5 clusters
    spread = max(0.2, cfg.cluster_std)  # never tighter than 0.2
    points, labels = make_blobs(
        n_samples=cfg.n_samples,
        centers=n_blobs,
        cluster_std=spread,
        random_state=seed,
    )
    blob_names = [f"Blob {i}" for i in range(n_blobs)]
    return points, labels.astype(int), blob_names
def load_classification_real(name: str):
    """Load a bundled sklearn classification dataset.

    Args:
        name: One of "iris", "wine_clf" or "breast_cancer".

    Returns:
        Tuple ``(X, y, feature_names, class_names)``; both name collections
        are lists.

    Raises:
        ValueError: If ``name`` is not one of the supported datasets.
    """
    # Reject unknown names up front so the loaders below are never touched.
    if name not in ("iris", "wine_clf", "breast_cancer"):
        raise ValueError(f"Unknown classification dataset: '{name}'")

    if name == "iris":
        bunch = load_iris()
        labels = list(bunch.target_names)
    elif name == "wine_clf":
        bunch = load_wine()
        # Use "Cultivar 1".."Cultivar 3" instead of the dataset's own names.
        labels = [f"Cultivar {i+1}" for i in range(3)]
    else:
        bunch = load_breast_cancer()
        labels = list(bunch.target_names)

    return bunch.data, bunch.target, list(bunch.feature_names), labels
def get_classification_dataset_info(name: str) -> dict:
    """Describe a real classification dataset.

    Returns a dict with "features" (feature names) and "classes" (class
    names) merged with the matching ``CLASSIFICATION_REAL_DATASETS``
    metadata, if any. An unknown ``name`` raises ``ValueError`` from
    ``load_classification_real``.
    """
    loaded = load_classification_real(name)
    info = {"features": loaded[2], "classes": loaded[3]}
    # Metadata goes in after the two name lists, matching the original order.
    info.update(CLASSIFICATION_REAL_DATASETS.get(name, {}))
    return info
|