"""
Data Preparation Script

This script takes a raw dataset, performs a stratified split into
training, validation, and test sets, and saves them to a processed
data directory. This ensures a consistent and reproducible data
splitting strategy.

Usage:
    python scripts/prepare_data.py --data-path /path/to/raw/data.csv --output-path data/processed
"""

from pathlib import Path
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split


def prepare_data(data_path: Path, output_path: Path, test_size: float = 0.2, val_size: float = 0.15):
    """
    Loads data, performs stratified train-val-test split, and saves the splits.

    Args:
        data_path (Path): Path to the raw data file (CSV expected).
        output_path (Path): Directory to save the processed data splits.
        test_size (float): Proportion of the dataset to include in the test split.
        val_size (float): Proportion of the training set to use for validation.
    """
    if not data_path.exists():
        raise FileNotFoundError(f"Raw data not found at {data_path}")

    print(f"Loading data from {data_path}...")
    # This assumes a CSV with a 'spectra' column and a 'label' column.
    # You will need to adapt this to your actual raw data format.
    df = pd.read_csv(data_path)

    # Ensure the 'label' column exists in the dataset
    if 'label' not in df.columns:
        raise ValueError(
            "The input data must contain a 'label' column for stratified splitting.")

    # Ensure output directory exists
    output_path.mkdir(parents=True, exist_ok=True)

    print("Performing stratified train-test split...")
    # Split off the test set first
    train_val_df, test_df = train_test_split(
        df, test_size=test_size, stratify=df['label'], random_state=42
    )

    # Split the remaining data into training and validation sets
    train_df, val_df = train_test_split(
        train_val_df, test_size=val_size, stratify=train_val_df['label'], random_state=42
    )

    print(f"Train set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df)}")

    # Save the splits
    train_df.to_csv(output_path / "train.csv", index=False)
    val_df.to_csv(output_path / "validation.csv", index=False)
    test_df.to_csv(output_path / "test.csv", index=False)

    print(f"✅ Data splits saved to {output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Prepare and split spectral data.")
    parser.add_argument("--data-path", type=Path, required=True,
                        help="Path to the raw data CSV file.")
    parser.add_argument("--output-path", type=Path, default=Path(
        "data/processed"), help="Directory to save data splits.")
    args = parser.parse_args()
    prepare_data(args.data_path, args.output_path)