Auto-FineTune-Ops / preprocessing /train_val_split.py
aneeb15's picture
Initial release of Auto-FineTune-Ops
d4398e6
"""
Train / Validation Split Module
==================================
Split datasets with configurable ratio, seed, and shuffle.
"""
from dataclasses import dataclass
from typing import Tuple
import pandas as pd
@dataclass
class SplitConfig:
    """Settings that control how ``split_dataset`` divides a DataFrame.

    Fields are declared in the order accepted by the generated
    constructor: ``enabled``, ``train_ratio``, ``random_seed``, ``shuffle``.
    """

    # When False, split_dataset skips splitting and returns an empty
    # validation set.
    enabled: bool = True

    # Fraction of rows assigned to the training set; the remaining rows
    # form the validation set (0.8 -> 80% train / 20% validation).
    train_ratio: float = 0.8

    # Seed handed to pandas as ``random_state`` when rows are shuffled.
    random_seed: int = 42

    # Whether rows are shuffled before the split point is applied.
    shuffle: bool = True
def split_dataset(
    df: pd.DataFrame,
    config: "SplitConfig",
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a DataFrame into train and validation sets.

    Args:
        df: Rows to split.
        config: Split settings; the ``enabled``, ``train_ratio``,
            ``random_seed`` and ``shuffle`` attributes are read.

    Returns:
        (train_df, val_df) tuple. When ``config.enabled`` is False, ``df``
        itself is returned as the train set together with a zero-row
        validation frame that has the same columns and dtypes. Otherwise
        the first ``floor(len(df) * train_ratio)`` rows (after optional
        shuffling) form the train set and the rest the validation set,
        both with a fresh 0..n-1 index.

    Raises:
        ValueError: If splitting is enabled and ``config.train_ratio`` is
            not within the inclusive range [0, 1].
    """
    if not config.enabled:
        # A zero-row slice (rather than a brand-new empty frame) keeps the
        # column dtypes, matching what the enabled path yields for an
        # empty validation set.
        return df, df.iloc[0:0].reset_index(drop=True)

    ratio = config.train_ratio
    # Written as a negated chained comparison so that NaN is rejected too.
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(
            f"train_ratio must be between 0 and 1 inclusive, got {ratio!r}"
        )

    if config.shuffle:
        df = df.sample(frac=1, random_state=config.random_seed).reset_index(drop=True)

    # Floor the split point, but snap to the nearest integer first when the
    # product is a whole number up to float rounding error; otherwise
    # e.g. 100 * 0.29 == 28.999999999999996 would truncate to 28, not 29.
    raw_idx = len(df) * ratio
    nearest_idx = int(round(raw_idx))
    if abs(raw_idx - nearest_idx) <= 1e-9 * max(1.0, abs(raw_idx)):
        split_idx = nearest_idx
    else:
        split_idx = int(raw_idx)

    train_df = df.iloc[:split_idx].reset_index(drop=True)
    val_df = df.iloc[split_idx:].reset_index(drop=True)
    return train_df, val_df