Spaces:
Running
Running
File size: 4,365 Bytes
e17f3ba | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | """
Dataset Downloader
==================
Downloads real datasets into datasets/ as <name>_X.csv and <name>_y.csv pairs.
Sources:
- sklearn built-ins (iris, breast_cancer, diabetes, wine, digits)
- OpenML (titanic, credit-g, house_prices)
Run from repo root:
python scripts/download_datasets.py
"""
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
# Output directory for every CSV this script writes: a "datasets" folder two
# path levels above this file (the repo root when the script is located at
# scripts/download_datasets.py, as the module docstring describes).
OUT_DIR = Path(__file__).parent.parent / "datasets"
# Created as a side effect of loading this module so later writes cannot fail
# on a missing directory; exist_ok=True makes repeated runs harmless.
OUT_DIR.mkdir(parents=True, exist_ok=True)
def save(name, X, y, out_dir=None):
    """Write a feature matrix and its target as ``<name>_X.csv`` / ``<name>_y.csv``.

    Args:
        name: Dataset name, used as the file-name prefix.
        X: Feature matrix. A ``pandas.DataFrame`` is written as-is; any other
            array-like (NumPy array, list of rows, ...) is wrapped in a
            DataFrame first, so its columns get default integer labels.
        y: Target values. A pandas ``Series`` or ``DataFrame`` is written
            as-is; any other array-like is wrapped in a ``Series`` named
            ``"target"``.
        out_dir: Directory to write into. Defaults to the module-level
            ``OUT_DIR``. It is created if it does not exist yet.

    Both files are written without the index column, and a one-line summary
    with the row and column counts of ``X`` is printed.
    """
    target_dir = OUT_DIR if out_dir is None else Path(out_dir)
    # Cheap no-op when the directory already exists; protects against the
    # folder being removed (or never created) before this call.
    target_dir.mkdir(parents=True, exist_ok=True)

    # Normalise plain array-likes so .to_csv() and .shape are always available.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    if not isinstance(y, (pd.Series, pd.DataFrame)):
        y = pd.Series(y, name="target")

    X.to_csv(target_dir / f"{name}_X.csv", index=False)
    y.to_csv(target_dir / f"{name}_y.csv", index=False)
    print(f" [OK] {name:30s} {X.shape[0]:>5} rows x {X.shape[1]:>3} cols -> {target_dir.name}/")
def load_sklearn_datasets():
    """Export the scikit-learn built-in datasets as CSV pairs via ``save``.

    Each loader is called with ``as_frame=True`` and its ``data`` / ``target``
    attributes are handed to ``save`` under the listed name.
    """
    from sklearn import datasets

    print("\n[1/2] Downloading sklearn built-in datasets...")

    # (output name, loader) pairs, processed in this order.
    builtin_loaders = (
        ("iris", datasets.load_iris),                    # 3-class classification
        ("breast_cancer", datasets.load_breast_cancer),  # binary classification
        ("diabetes", datasets.load_diabetes),            # regression
        ("wine", datasets.load_wine),                    # 3-class classification
        ("digits", datasets.load_digits),                # 10-class classification
    )
    for label, loader in builtin_loaders:
        bunch = loader(as_frame=True)
        save(label, bunch.data, bunch.target)
def load_openml_datasets():
    """Fetch three OpenML datasets and export each one via ``save``.

    Every dataset is handled independently: if fetching, preparing or saving
    one of them raises, a ``[SKIP]`` line with the error is printed and the
    next dataset is attempted. If ``fetch_openml`` cannot be imported at all,
    a single ``[SKIP]`` line is printed and nothing is downloaded.
    """
    print("\n[2/2] Downloading OpenML datasets...")

    # Guard clause: without fetch_openml there is nothing left to do.
    try:
        from sklearn.datasets import fetch_openml
    except ImportError:
        print(" [SKIP] OpenML requires scikit-learn>=0.22 and internet access")
        return

    def numeric_features(frame):
        # Keep only numeric columns and replace missing values with 0.
        return frame.select_dtypes(include=[np.number]).fillna(0)

    # Titanic - binary classification (target value "1" becomes 1, else 0)
    try:
        bunch = fetch_openml("titanic", version=1, as_frame=True, parser="auto")
        features = numeric_features(bunch.data)
        labels = (bunch.target.astype(str).str.strip() == "1").astype(int)
        save("titanic", features, labels)
    except Exception as e:
        print(f" [SKIP] titanic: {e}")

    # Credit-G - binary classification (target value "good" becomes 1, else 0)
    try:
        bunch = fetch_openml("credit-g", version=1, as_frame=True, parser="auto")
        features = bunch.data.copy()
        # Replace categorical columns by their integer category codes ...
        for column in features.select_dtypes(include="category").columns:
            features[column] = features[column].cat.codes
        # ... and do the same for object columns after casting to category.
        for column in features.select_dtypes(include="object").columns:
            features[column] = features[column].astype("category").cat.codes
        labels = (bunch.target.astype(str).str.strip() == "good").astype(int)
        save("credit_g", features, labels)
    except Exception as e:
        print(f" [SKIP] credit-g: {e}")

    # House Prices - regression (target cast to float)
    try:
        bunch = fetch_openml("house_prices", version=1, as_frame=True, parser="auto")
        features = numeric_features(bunch.data)
        labels = bunch.target.astype(float)
        save("house_prices", features, labels)
    except Exception as e:
        print(f" [SKIP] house_prices: {e}")
def print_summary(out_dir=None):
    """Print a table of the downloaded datasets plus example run commands.

    Args:
        out_dir: Directory to scan for ``*_X.csv`` files. Defaults to the
            module-level ``OUT_DIR``.

    For every ``<name>_X.csv`` the data-row count (header excluded) and the
    column count are reported. The task type is guessed from the first column
    of the matching ``<name>_y.csv``: fewer than 20 distinct values is
    labelled "classification", otherwise "regression"; "?" is shown when the
    target file cannot be read. Example commands are printed for at most the
    first three datasets.
    """
    import csv

    directory = OUT_DIR if out_dir is None else Path(out_dir)
    files = sorted(directory.glob("*_X.csv"))

    print(f"\n{'='*55}")
    print(f" {len(files)} dataset(s) ready in {directory.name}/")
    print(f"{'='*55}")

    def dataset_name(path):
        # Strip only the trailing "_X"; str.replace would also remove any
        # "_X" occurring inside the dataset name itself.
        return path.stem[: -len("_X")]

    for f in files:
        name = dataset_name(f)

        # One pass, file closed deterministically. The csv module copes with
        # quoted column names that contain commas; an empty file yields
        # 0 rows / 0 columns instead of -1 rows.
        with open(f, newline="", encoding="utf-8") as handle:
            reader = csv.reader(handle)
            header = next(reader, [])
            rows = sum(1 for _ in reader)
        cols = len(header)

        y_file = directory / f"{name}_y.csv"
        # Best-effort task guess from the number of distinct target values.
        try:
            uniq = pd.read_csv(y_file).iloc[:, 0].nunique()
            task = "classification" if uniq < 20 else "regression"
        except (OSError, ValueError, IndexError):
            # Missing, empty or unparsable target file: keep going.
            task = "?"
        print(f" {name:30s} {rows:>5} rows {cols:>3} feat [{task}]")

    print("\nRun an experiment with:")
    print(" cd code")
    for f in files[:3]:
        name = dataset_name(f)
        print(f" python -m runners.run_experiment --dataset {name} --model xgboost")
if __name__ == "__main__":
    # Banner, then the three pipeline stages in order:
    # sklearn built-ins, OpenML downloads, final summary table.
    rule = "="*55
    print(rule)
    print(" SAP RPT-1 Benchmarking β Dataset Downloader")
    print(rule)
    for stage in (load_sklearn_datasets, load_openml_datasets, print_summary):
        stage()
|