Spaces:
Sleeping
Sleeping
| """Data loading with a fast parquet cache and a leakage-safe stratified split.""" | |
| from __future__ import annotations | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from src import config | |
def load_raw(use_cache: bool = True) -> pd.DataFrame:
    """Load the full dataset, caching it as parquet for faster reloads.

    When ``use_cache`` is true and ``config.CACHE_PARQUET`` exists, the cached
    frame is returned as-is. Otherwise ``config.RAW_CSV`` is parsed, its
    leading unnamed index column (if present) is dropped, and the result is
    written to ``config.CACHE_PARQUET``. The cache is written on every CSV
    load, so calling with ``use_cache=False`` also refreshes it.

    Args:
        use_cache: Return the parquet cache when it already exists.

    Returns:
        The dataset as a DataFrame.
    """
    cache_path = config.CACHE_PARQUET
    if use_cache and cache_path.exists():
        return pd.read_parquet(cache_path)

    df = pd.read_csv(config.RAW_CSV)

    # The first column is an unnamed row index left over from the CSV export.
    first = df.columns[0]
    if first.startswith("Unnamed") or first == "":
        df = df.drop(columns=[first])

    # Make sure the cache directory exists before writing (e.g. on a fresh
    # checkout); otherwise the write fails even though the data loaded fine.
    # Assumes config.CACHE_PARQUET is a pathlib.Path, as its .exists() use
    # above suggests.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(cache_path)
    return df
def split_xy(df: pd.DataFrame):
    """Separate the target column from the feature columns.

    Returns a ``(X, y)`` pair: ``y`` is the ``config.TARGET`` column cast to
    ``int`` and ``X`` is ``df`` with that column removed. ``df`` itself is
    not modified.
    """
    target_col = config.TARGET
    # Look the target up first so a missing column fails on the lookup itself.
    labels = df[target_col].astype(int)
    features = df.drop(labels=[target_col], axis=1)
    return features, labels
def train_test(df: pd.DataFrame):
    """Split ``df`` into stratified train and test partitions.

    The split is stratified on the target, sized by ``config.TEST_SIZE`` and
    seeded with ``config.SEED``. The test partition is meant as a true
    holdout: used only for final evaluation and to feed the live simulator.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    features, labels = split_xy(df)
    split_options = {
        "test_size": config.TEST_SIZE,
        "stratify": labels,  # keep class proportions equal in both partitions
        "random_state": config.SEED,
    }
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, **split_options
    )
    return X_train, X_test, y_train, y_test