| """
|
| data/datasets.py
|
| All dataset generation and loading for the ML course.
|
| Each model page imports from here.
|
| """
|
|
|
| import numpy as np
|
| from sklearn.datasets import (
|
| load_diabetes, load_wine, load_linnerud,
|
| load_iris, load_breast_cancer,
|
| make_moons, make_circles, make_blobs,
|
| )
|
| from pydantic import BaseModel
|
| from typing import Optional
|
|
|
|
|
|
|
|
|
class SyntheticConfig(BaseModel):
    """Parameters consumed by ``generate_synthetic`` to build a regression set.

    Only ``dataset_type`` is required; every other field has a default.
    Which fields matter depends on the chosen ``dataset_type``.
    """

    # Shape of the response curve: "linear", "polynomial", "sinusoidal" or
    # "heteroscedastic". Any other value makes generate_synthetic fall back
    # to a fixed line (2 * X, noise std 0.3).
    dataset_type: str

    # Number of points drawn.
    n_samples: int = 200

    # Standard deviation of the Gaussian noise added to y (doubled for
    # "polynomial", scaled by 1 + |X| for "heteroscedastic").
    noise: float = 0.3

    # "polynomial" only: y is the sum X**1 + ... + X**degree.
    degree: int = 2

    # "sinusoidal" only: y = amplitude * sin(frequency * X + phase).
    frequency: float = 1.0
    amplitude: float = 1.0
    phase: float = 0.0

    # "linear" uses both; "heteroscedastic" uses the slope only.
    slope: float = 2.0
    intercept: float = 1.0

    # Share of samples turned into outliers; 0 disables outlier injection.
    outlier_fraction: float = 0.0

    # Seed for the NumPy RandomState, so results are reproducible.
    random_state: int = 42
|
|
|
|
|
class RealDatasetConfig(BaseModel):
    """Request model naming one bundled regression dataset."""

    # Presumably one of the REAL_DATASETS keys, as accepted by
    # load_real_dataset -- confirm against the caller.
    dataset_name: str
|
|
|
|
|
|
|
|
|
# dataset_type values that have a dedicated branch in generate_synthetic.
SYNTHETIC_DATASETS = {
    "linear",
    "polynomial",
    "sinusoidal",
    "heteroscedastic",
}

# Display metadata for the datasets load_real_dataset can return, keyed by
# the same names. get_dataset_info merges these entries into its result.
REAL_DATASETS = {
    "diabetes": {
        "label": "Diabetes",
        "dims": "10D",
        "rows": "442",
    },
    "wine_quality": {
        "label": "Wine",
        "dims": "13D",
        "rows": "178",
    },
    "linnerud": {
        "label": "Linnerud",
        "dims": "3D",
        "rows": "20",
    },
}
|
|
|
|
|
def generate_synthetic(cfg: SyntheticConfig):
    """Return ``(X, y)`` NumPy arrays for a single-feature regression dataset.

    ``X`` holds ``cfg.n_samples`` values drawn uniformly between -3 and 3 and
    sorted ascending. ``y`` is the noisy response chosen by
    ``cfg.dataset_type``:

    * ``"linear"``          -- ``slope * X + intercept``
    * ``"polynomial"``      -- ``X + X**2 + ... + X**degree``, noise std doubled
    * ``"sinusoidal"``      -- ``amplitude * sin(frequency * X + phase)``
    * ``"heteroscedastic"`` -- ``slope * X``, noise std scaled by ``1 + |X|``
    * any other value       -- fallback line ``2 * X`` with noise std 0.3

    If ``cfg.outlier_fraction`` is positive, that share of the samples (at
    least one, at most all of them) is shifted up or down by 4 to 8 standard
    deviations of ``y``.

    The output is fully determined by ``cfg.random_state``.
    """
    n = cfg.n_samples
    kind = cfg.dataset_type
    rng = np.random.RandomState(cfg.random_state)
    X = np.sort(rng.uniform(-3, 3, n))

    # The tiny floor keeps the Gaussian scale valid for zero or negative noise.
    sigma = max(cfg.noise, 1e-6)

    if kind == "linear":
        y = cfg.slope * X + cfg.intercept + rng.normal(0, sigma, n)

    elif kind == "polynomial":
        # Clamp at zero so a negative noise setting cannot crash rng.normal,
        # matching the other branches; non-negative values are untouched.
        poly_sigma = max(cfg.noise, 0.0) * 2
        y = sum(X ** i for i in range(1, cfg.degree + 1)) + rng.normal(0, poly_sigma, n)

    elif kind == "sinusoidal":
        y = cfg.amplitude * np.sin(cfg.frequency * X + cfg.phase) + rng.normal(0, sigma, n)

    elif kind == "heteroscedastic":
        # Noise spread widens with distance from the origin.
        y = cfg.slope * X + rng.normal(0, sigma * (1 + np.abs(X)), n)

    else:
        # Unknown type: keep the historical best-effort fallback.
        y = 2.0 * X + rng.normal(0, 0.3, n)

    if cfg.outlier_fraction > 0 and n > 0:
        # Sampling without replacement cannot pick more indices than exist,
        # so cap the count at n (fractions above 1 hit every sample).
        n_out = min(n, max(1, int(n * cfg.outlier_fraction)))
        idx = rng.choice(n, n_out, replace=False)
        signs = rng.choice([-1, 1], n_out)
        magnitudes = rng.uniform(4, 8, n_out)
        # np.std(y) is evaluated before the in-place shift is applied.
        y[idx] += signs * magnitudes * np.std(y)

    return X, y
|
|
|
|
|
def load_real_dataset(name: str):
    """Load one of the bundled scikit-learn datasets for regression demos.

    Args:
        name: ``"diabetes"``, ``"wine_quality"`` or ``"linnerud"``.

    Returns:
        A ``(X, y, feature_names)`` tuple, with ``feature_names`` as a list.

    Raises:
        ValueError: if ``name`` is not one of the supported datasets.
    """
    if name == "diabetes":
        bunch = load_diabetes()
        target = bunch.target
    elif name == "wine_quality":
        # NOTE(review): load_wine's target is a class label, not a quality
        # score; it is only cast to float here -- confirm this is intended.
        bunch = load_wine()
        target = bunch.target.astype(float)
    elif name == "linnerud":
        # Linnerud has several target columns; only the first one is used.
        bunch = load_linnerud()
        target = bunch.target[:, 0]
    else:
        raise ValueError(f"Unknown real dataset: '{name}'")

    return bunch.data, target, list(bunch.feature_names)
|
|
|
|
|
def get_dataset_info(name: str) -> dict:
    """Describe a real regression dataset.

    The result has a ``"features"`` entry with the dataset's feature names,
    followed by whatever metadata ``REAL_DATASETS`` holds for ``name``
    (nothing extra if there is no entry). The dataset is loaded via
    ``load_real_dataset``, so an unknown ``name`` raises ``ValueError``.
    """
    columns = load_real_dataset(name)[2]
    info = {"features": columns}
    info.update(REAL_DATASETS.get(name, {}))
    return info
|
|
|
|
|
|
|
|
|
class ClassificationSyntheticConfig(BaseModel):
    """Parameters consumed by ``generate_classification_synthetic``.

    Only ``dataset_type`` is required; every other field has a default.
    """

    # "moons" or "circles"; any other value is generated as blobs.
    dataset_type: str

    # Total number of points to generate.
    n_samples: int = 300

    # Noise level passed to make_moons / make_circles (unused for blobs).
    noise: float = 0.2

    # Blobs only: number of clusters, clamped to the range 2..5.
    n_centers: int = 3

    # Circles only: inner/outer scale factor, clamped to 0.05..0.9.
    factor: float = 0.5

    # Blobs only: cluster standard deviation, floored at 0.2.
    cluster_std: float = 1.0

    # Seed forwarded to the scikit-learn generator.
    random_state: int = 42
|
|
|
|
|
class ClassificationRealConfig(BaseModel):
    """Request model naming one bundled classification dataset."""

    # Presumably one of the CLASSIFICATION_REAL_DATASETS keys, as accepted
    # by load_classification_real -- confirm against the caller.
    dataset_name: str
|
|
|
|
|
# Synthetic generators offered by generate_classification_synthetic.
CLASSIFICATION_SYNTHETIC_DATASETS = {
    "moons",
    "circles",
    "blobs",
}

# Display metadata for the datasets load_classification_real can return,
# keyed by the same names. get_classification_dataset_info merges these
# entries into its result.
CLASSIFICATION_REAL_DATASETS = {
    "iris": {
        "label": "Iris",
        "dims": "4D",
        "rows": "150",
        "n_classes": 3,
    },
    "wine_clf": {
        "label": "Wine",
        "dims": "13D",
        "rows": "178",
        "n_classes": 3,
    },
    "breast_cancer": {
        "label": "Breast Cancer",
        "dims": "30D",
        "rows": "569",
        "n_classes": 2,
    },
}
|
|
|
|
|
|
|
|
|
def generate_classification_synthetic(cfg: ClassificationSyntheticConfig):
    """Build a synthetic classification dataset.

    ``cfg.dataset_type`` picks the generator: ``"moons"``, ``"circles"``, or
    blobs for any other value.

    Returns:
        ``(X, y, class_names)`` where ``y`` is cast to ``int`` and
        ``class_names`` is a list of display labels, one per class.
    """
    kind = cfg.dataset_type
    seed = cfg.random_state

    if kind == "moons":
        points, labels = make_moons(
            n_samples=cfg.n_samples,
            noise=cfg.noise,
            random_state=seed,
        )
        return points, labels.astype(int), ["Class 0", "Class 1"]

    if kind == "circles":
        # Keep the inner/outer scale factor within 0.05..0.9.
        ratio = max(0.05, min(0.9, cfg.factor))
        points, labels = make_circles(
            n_samples=cfg.n_samples,
            noise=cfg.noise,
            factor=ratio,
            random_state=seed,
        )
        return points, labels.astype(int), ["Inner", "Outer"]

    # Everything else is treated as blobs, limited to 2..5 clusters with a
    # spread of at least 0.2.
    n_blobs = max(2, min(cfg.n_centers, 5))
    spread = max(0.2, cfg.cluster_std)
    points, labels = make_blobs(
        n_samples=cfg.n_samples,
        centers=n_blobs,
        cluster_std=spread,
        random_state=seed,
    )
    blob_names = [f"Blob {i}" for i in range(n_blobs)]
    return points, labels.astype(int), blob_names
|
|
|
|
|
def load_classification_real(name: str):
    """Load one of the bundled scikit-learn classification datasets.

    Args:
        name: ``"iris"``, ``"wine_clf"`` or ``"breast_cancer"``.

    Returns:
        A ``(X, y, feature_names, class_names)`` tuple; both name
        collections are lists.

    Raises:
        ValueError: if ``name`` is not one of the supported datasets.
    """
    if name == "iris":
        bunch = load_iris()
        labels = list(bunch.target_names)
    elif name == "wine_clf":
        bunch = load_wine()
        # Friendlier labels than the dataset's own target names.
        labels = [f"Cultivar {i+1}" for i in range(3)]
    elif name == "breast_cancer":
        bunch = load_breast_cancer()
        labels = list(bunch.target_names)
    else:
        raise ValueError(f"Unknown classification dataset: '{name}'")

    return bunch.data, bunch.target, list(bunch.feature_names), labels
|
|
|
|
|
def get_classification_dataset_info(name: str) -> dict:
    """Describe a real classification dataset.

    The result has ``"features"`` and ``"classes"`` entries, followed by
    whatever metadata ``CLASSIFICATION_REAL_DATASETS`` holds for ``name``
    (nothing extra if there is no entry). The dataset is loaded via
    ``load_classification_real``, so an unknown ``name`` raises
    ``ValueError``.
    """
    columns, labels = load_classification_real(name)[2:]
    info = {"features": columns, "classes": labels}
    info.update(CLASSIFICATION_REAL_DATASETS.get(name, {}))
    return info
|
|
|