Spaces:
Configuration error
Configuration error
| """ | |
| Train / Validation Split Module | |
| ================================== | |
| Split datasets with configurable ratio, seed, and shuffle. | |
| """ | |
| from dataclasses import dataclass | |
| from typing import Tuple | |
| import pandas as pd | |
@dataclass
class SplitConfig:
    """Configuration for train/validation split.

    Declared as a dataclass so that individual settings can be overridden
    through keyword arguments, e.g. ``SplitConfig(train_ratio=0.9)``;
    calling ``SplitConfig()`` with no arguments yields the defaults below.
    """

    # When False, split_dataset returns the input as the train set and an
    # empty validation set.
    enabled: bool = True
    # Fraction of rows assigned to the train set.
    train_ratio: float = 0.8  # e.g., 0.8 means 80% train, 20% val
    # Seed passed to the shuffle so that splits are reproducible.
    random_seed: int = 42
    # Whether rows are shuffled before the split point is applied.
    shuffle: bool = True
def split_dataset(
    df: pd.DataFrame,
    config: SplitConfig,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split DataFrame into train and validation sets.

    Args:
        df: Rows to split. The caller's frame is never modified.
        config: Split settings; the ``enabled``, ``train_ratio``,
            ``random_seed`` and ``shuffle`` attributes are read.

    Returns:
        (train_df, val_df) tuple. When ``config.enabled`` is False the
        train set is ``df`` itself and the validation set is an empty
        frame with the same columns and dtypes. Otherwise both frames
        carry a fresh 0-based index.

    Raises:
        ValueError: If splitting is enabled and ``config.train_ratio``
            is not within [0, 1].
    """
    if not config.enabled:
        # Slicing (rather than building a new frame from the column
        # names) keeps the column dtypes on the empty validation set.
        return df, df.iloc[:0].reset_index(drop=True)

    ratio = config.train_ratio
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {ratio!r}")

    if config.shuffle:
        # frac=1 draws every row once, i.e. a seeded permutation.
        df = df.sample(frac=1, random_state=config.random_seed).reset_index(drop=True)

    # Round before truncating: binary floating point makes products such
    # as 100 * 0.29 land just below the intended integer
    # (28.999999999999996), which a bare int() would cut down to 28.
    split_idx = int(round(len(df) * ratio, 9))

    train_df = df.iloc[:split_idx].reset_index(drop=True)
    val_df = df.iloc[split_idx:].reset_index(drop=True)
    return train_df, val_df