Spaces:
Sleeping
Sleeping
| """Dataset loaders for AutoMLOps demo.""" | |
| import numpy as np | |
| from sklearn.datasets import ( | |
| load_iris, load_wine, load_breast_cancer, load_digits, | |
| load_diabetes, fetch_california_housing | |
| ) | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler | |
def _dataset_entry(task, description, loader, icon, difficulty):
    """Build one registry record with keys in the order the fields are listed."""
    return {
        "task": task,
        "description": description,
        "loader": loader,
        "icon": icon,
        "difficulty": difficulty,
    }


# Registry of the demo datasets, keyed by display name. Each record stores the
# task type, a human-readable description, a zero-argument loader callable,
# an icon string and a difficulty label.
# NOTE(review): the icon strings look mis-encoded (possibly UTF-8 emoji decoded
# with the wrong code page) -- confirm the intended glyphs before changing them.
DATASETS = {
    # -- Classification ------------------------------------------------------
    "Iris Flowers": _dataset_entry(
        task="classification",
        description="Classic 3-class flower species classification (150 samples, 4 features)",
        loader=load_iris,
        icon="πΈ",
        difficulty="Easy",
    ),
    "Wine Quality": _dataset_entry(
        task="classification",
        description="Wine cultivar identification from chemical analysis (178 samples, 13 features)",
        loader=load_wine,
        icon="π·",
        difficulty="Easy",
    ),
    "Breast Cancer": _dataset_entry(
        task="classification",
        description="Tumour malignancy detection (569 samples, 30 features)",
        loader=load_breast_cancer,
        icon="π¬",
        difficulty="Medium",
    ),
    "Handwritten Digits": _dataset_entry(
        task="classification",
        description="Digit recognition 0-9 from pixel images (1797 samples, 64 features)",
        loader=load_digits,
        icon="βοΈ",
        difficulty="Medium",
    ),
    # -- Regression ----------------------------------------------------------
    "Diabetes Progression": _dataset_entry(
        task="regression",
        description="Disease progression prediction from physiological measurements (442 samples, 10 features)",
        loader=load_diabetes,
        icon="π",
        difficulty="Medium",
    ),
    "California Housing": _dataset_entry(
        task="regression",
        description="House price prediction from socio-economic data (20640 samples, 8 features)",
        loader=fetch_california_housing,
        icon="π ",
        difficulty="Hard",
    ),
}
def load_dataset(name: str, test_size: float = 0.2, random_state: int = 42):
    """Fetch a registered dataset and split it into train and test partitions.

    Args:
        name: Key into ``DATASETS`` selecting the dataset to load.
        test_size: Passed through to ``train_test_split`` as ``test_size``.
        random_state: Passed through to ``train_test_split`` as ``random_state``.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, metadata)``. ``metadata``
        is a dict holding the registry fields for the dataset, sample and
        feature counts, feature and target names, and the number of distinct
        labels (``None`` for non-classification tasks).

    Raises:
        ValueError: If ``name`` is not a key of ``DATASETS``.
    """
    if name not in DATASETS:
        known = list(DATASETS.keys())
        raise ValueError(f"Unknown dataset: {name}. Available: {known}")

    spec = DATASETS[name]
    bunch = spec["loader"]()
    features = bunch.data
    labels = bunch.target

    # Fall back to synthetic column names when the loader provides none.
    if hasattr(bunch, "feature_names"):
        feature_names = list(bunch.feature_names)
    else:
        feature_names = [f"feature_{idx}" for idx in range(features.shape[1])]

    target_names = None
    if hasattr(bunch, "target_names"):
        target_names = list(bunch.target_names)

    # Classification splits are stratified on the labels so that every class
    # keeps its share of the test partition; other tasks split without it.
    is_classification = spec["task"] == "classification"
    if is_classification:
        stratify_on = labels
    else:
        stratify_on = None

    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_on,
    )
    X_train, X_test, y_train, y_test = parts

    metadata = {"name": name}
    # Copy the descriptive registry fields in a fixed order.
    metadata.update(
        {key: spec[key] for key in ("task", "description", "icon", "difficulty")}
    )
    metadata["n_samples"] = features.shape[0]
    metadata["n_features"] = features.shape[1]
    metadata["n_train"] = X_train.shape[0]
    metadata["n_test"] = X_test.shape[0]
    metadata["feature_names"] = feature_names
    metadata["target_names"] = target_names
    if spec["task"] == "classification":
        metadata["n_classes"] = len(np.unique(labels))
    else:
        metadata["n_classes"] = None

    return X_train, X_test, y_train, y_test, metadata