Spaces:
Sleeping
Sleeping
| """ | |
| Data Preparation Script | |
| This script takes a raw dataset, performs a stratified split into | |
| training, validation, and test sets, and saves them to a processed | |
| data directory. This ensures a consistent and reproducible data | |
| splitting strategy. | |
| Usage: | |
| python scripts/prepare_data.py --data-path /path/to/raw/data.csv --output-path data/processed | |
| """ | |
| from pathlib import Path | |
| import argparse | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
def prepare_data(
    data_path: Path,
    output_path: Path,
    test_size: float = 0.2,
    val_size: float = 0.15,
    *,
    label_col: str = "label",
    random_state: int = 42,
) -> None:
    """
    Load a CSV, perform a stratified train/validation/test split, and save the splits.

    The test set is split off first; the validation set is then taken from
    the remaining rows. With the defaults (0.2 and 0.15) the validation set
    is therefore 0.15 * 0.8 = 12% of the full dataset, not 15%.

    Writes ``train.csv``, ``validation.csv`` and ``test.csv`` into
    ``output_path`` (created if necessary, existing files are overwritten).

    Args:
        data_path (Path): Path to the raw data file (CSV expected).
        output_path (Path): Directory to save the processed data splits.
        test_size (float): Proportion of the dataset to include in the test split.
        val_size (float): Proportion of the non-test remainder to use for validation.
        label_col (str): Name of the column used for stratification.
        random_state (int): Seed passed to both splits so results are reproducible.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If ``label_col`` is not a column of the data or contains
            missing values. Errors raised by ``train_test_split`` itself
            (e.g. for unusable split sizes) propagate unchanged.
    """
    if not data_path.exists():
        raise FileNotFoundError(f"Raw data not found at {data_path}")

    print(f"Loading data from {data_path}...")
    # This assumes a CSV with a 'spectra' column and a label column.
    # You will need to adapt this to your actual raw data format.
    df = pd.read_csv(data_path)

    # Ensure the label column exists in the dataset
    if label_col not in df.columns:
        raise ValueError(
            f"The input data must contain a '{label_col}' column for stratified splitting."
        )

    # Empty cells are read as NaN; stratifying on them either creates a
    # spurious class or fails deep inside scikit-learn with an unrelated
    # looking error, so reject them up front with a clear message.
    missing_labels = int(df[label_col].isna().sum())
    if missing_labels:
        raise ValueError(
            f"The '{label_col}' column contains {missing_labels} missing value(s); "
            "every row needs a label for stratified splitting."
        )

    print("Performing stratified train-test split...")
    # Split off the test set first
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df[label_col],
        random_state=random_state,
    )
    # Split the remaining data into training and validation sets
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size,
        stratify=train_val_df[label_col],
        random_state=random_state,
    )

    print(f"Train set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df)}")

    # Create the output directory only once the splits exist, so a failed
    # split does not leave an empty directory behind.
    output_path.mkdir(parents=True, exist_ok=True)

    # Save the splits
    train_df.to_csv(output_path / "train.csv", index=False)
    val_df.to_csv(output_path / "validation.csv", index=False)
    test_df.to_csv(output_path / "test.csv", index=False)
    print(f"✅ Data splits saved to {output_path}")
def build_arg_parser() -> argparse.ArgumentParser:
    """
    Build the command-line parser for the data preparation script.

    Kept separate from the ``__main__`` guard so the CLI definition can be
    inspected or tested without running the split.

    Returns:
        argparse.ArgumentParser: Parser exposing ``--data-path`` (required),
        ``--output-path``, ``--test-size`` and ``--val-size``.
    """
    parser = argparse.ArgumentParser(
        description="Prepare and split spectral data.")
    parser.add_argument("--data-path", type=Path, required=True,
                        help="Path to the raw data CSV file.")
    parser.add_argument("--output-path", type=Path, default=Path(
        "data/processed"), help="Directory to save data splits.")
    # The split proportions default to the same values as prepare_data(),
    # so invoking the script without them behaves exactly as before.
    parser.add_argument("--test-size", type=float, default=0.2,
                        help="Proportion of the dataset to hold out as the "
                             "test split (default: %(default)s).")
    parser.add_argument("--val-size", type=float, default=0.15,
                        help="Proportion of the non-test data to use for "
                             "validation (default: %(default)s).")
    return parser


if __name__ == "__main__":
    args = build_arg_parser().parse_args()
    prepare_data(
        args.data_path,
        args.output_path,
        test_size=args.test_size,
        val_size=args.val_size,
    )