File size: 1,089 Bytes
d4398e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""

Train / Validation Split Module

==================================

Split datasets with configurable ratio, seed, and shuffle.

"""

from dataclasses import dataclass
from typing import Tuple
import pandas as pd


@dataclass
class SplitConfig:
    """Settings that control a train/validation split.

    Attributes:
        enabled: Whether a split should be performed at all.
        train_ratio: Fraction of rows assigned to the training set; the
            remainder goes to validation (0.8 -> 80% train, 20% val).
        random_seed: Seed used when the rows are shuffled.
        shuffle: Whether rows are shuffled before being split.
    """

    enabled: bool = True
    train_ratio: float = 0.8
    random_seed: int = 42
    shuffle: bool = True


def split_dataset(
    df: pd.DataFrame,
    config: SplitConfig,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a DataFrame into train and validation sets.

    Args:
        df: Dataset to split. The caller's frame is never modified in place.
        config: Split settings; ``enabled``, ``train_ratio``, ``random_seed``
            and ``shuffle`` are read from it.

    Returns:
        A ``(train_df, val_df)`` tuple. When ``config.enabled`` is false,
        ``df`` itself is returned as the train set together with an empty
        frame that has the same columns and dtypes. Otherwise the first
        ``floor(len(df) * train_ratio)`` rows (after the optional shuffle)
        form the train set and the remaining rows the validation set, each
        with a fresh 0-based index.

    Raises:
        ValueError: If splitting is enabled and ``config.train_ratio`` is
            not within ``[0.0, 1.0]`` (NaN included).
    """
    if not config.enabled:
        # A zero-row slice keeps the column dtypes, whereas
        # pd.DataFrame(columns=...) would not.
        return df, df.iloc[0:0].reset_index(drop=True)

    ratio = config.train_ratio
    # Written as "not (lo <= x <= hi)" so that NaN is rejected as well.
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(
            f"train_ratio must be between 0.0 and 1.0, got {ratio!r}"
        )

    if config.shuffle:
        df = df.sample(frac=1, random_state=config.random_seed).reset_index(drop=True)

    # Floor of len * ratio, with a tiny tolerance so binary float error
    # (e.g. 100 * 0.29 == 28.999999999999996) does not drop a training row.
    split_idx = int(len(df) * ratio + 1e-9)
    train_df = df.iloc[:split_idx].reset_index(drop=True)
    val_df = df.iloc[split_idx:].reset_index(drop=True)

    return train_df, val_df