"""Data loading with a fast parquet cache and a leakage-safe stratified split."""
from __future__ import annotations

import pandas as pd
from sklearn.model_selection import train_test_split

from src import config


def load_raw(use_cache: bool = True) -> pd.DataFrame:
    """Load the full dataset, caching it as parquet for faster reloads.

    When ``use_cache`` is true and ``config.CACHE_PARQUET`` already exists,
    the frame is read straight from that parquet file. Otherwise
    ``config.RAW_CSV`` is parsed, its leading unnamed index column (if
    present) is dropped, and the result is written to
    ``config.CACHE_PARQUET`` before being returned.

    Note that the cache is (re)written on every CSV load, including calls
    made with ``use_cache=False``.

    Args:
        use_cache: Read from the parquet cache when it already exists.

    Returns:
        The dataset as a DataFrame. On the CSV path the leading unnamed
        index column has been removed; on the cache path the frame is
        returned exactly as stored.
    """
    cache_path = config.CACHE_PARQUET
    if use_cache and cache_path.exists():
        return pd.read_parquet(cache_path)

    df = pd.read_csv(config.RAW_CSV)
    # A CSV saved together with its row index has an empty first header
    # cell, which pandas surfaces as a column named "Unnamed: 0".
    first = df.columns[0]
    if first.startswith("Unnamed") or first == "":
        df = df.drop(columns=[first])

    # Make sure the cache directory exists so the first load on a fresh
    # checkout does not fail while writing the parquet file.
    # NOTE(review): assumes config.CACHE_PARQUET is a pathlib.Path (it is
    # already used via ``.exists()``) -- confirm in src/config.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(cache_path)
    return df


def split_xy(df: pd.DataFrame):
    """Separate the target column from the feature columns.

    The column named by ``config.TARGET`` is cast to ``int`` and returned as
    ``y``; every remaining column is returned as ``X``. The input frame is
    not modified.

    Returns:
        A ``(X, y)`` tuple.
    """
    target_col = config.TARGET
    labels = df[target_col].astype(int)
    features = df.drop(columns=[target_col])
    return features, labels


def train_test(df: pd.DataFrame):
    """Split ``df`` into stratified train and test partitions.

    The target is separated with ``split_xy`` and then used as the
    stratification key. The test fraction and random seed come from
    ``config.TEST_SIZE`` and ``config.SEED``. The test partition is meant to
    be a true holdout: use it only for final evaluation and as the input
    feed for the live simulator.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    features, labels = split_xy(df)
    split_options = {
        "test_size": config.TEST_SIZE,
        "stratify": labels,
        "random_state": config.SEED,
    }
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, **split_options
    )
    return X_train, X_test, y_train, y_test