File size: 3,027 Bytes
fede53c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""Data management with train / validation / test splits.

Provides the ``DataSplit`` dataclass that partitions a dataset into
three non-overlapping subsets with proper isolation:

* **train** : full features + target (visible to the agent)
* **val** : features only (target hidden, used by ``evaluate()``)
* **test** : features only (target hidden, used on ``Submit``)
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, List

import pandas as pd
from sklearn.model_selection import train_test_split

if TYPE_CHECKING:
    from .datasets import DatasetConfig

@dataclass
class DataSplit:
    """Container for the three disjoint partitions of a dataset.

    The fields fall into two groups, depending on who is meant to see them.

    Exposed to the agent
    --------------------
    * ``train_df``      : training rows, features **and** target together
    * ``val_features``  : validation rows, target column withheld
    * ``test_features`` : test rows, target column withheld

    Kept by the environment
    -----------------------
    * ``val_labels``    : validation targets (consumed by ``evaluate()``)
    * ``test_labels``   : test targets (consumed for final scoring on
      ``Submit``)
    """

    # Field order is part of the positional constructor signature.
    train_df: pd.DataFrame       # features + target
    val_features: pd.DataFrame   # features only
    val_labels: pd.Series        # hidden validation targets
    test_features: pd.DataFrame  # features only
    test_labels: pd.Series       # hidden test targets


def create_data_split(
    config: "DatasetConfig",
    *,
    val_ratio: float = 0.15,
    test_ratio: float = 0.15,
    seed: int = 42,
) -> DataSplit:
    """Split a :class:`DatasetConfig` into train / val / test.

    The test set is carved off first, then the validation set is taken
    from the remainder. Both splits shuffle with the same ``seed``.

    Parameters
    ----------
    config
        Dataset with ``df``, ``target_column``, ``feature_columns``.
    val_ratio
        Fraction of the *whole* dataset reserved for validation.
        Must be strictly positive.
    test_ratio
        Fraction of the *whole* dataset reserved for the hidden test set.
        Must be strictly positive.
    seed
        Random seed for reproducible splits.

    Returns
    -------
    DataSplit
        ``train_df`` keeps every column; ``val_features`` and
        ``test_features`` keep only the configured feature columns that
        exist in the data (never the target); ``val_labels`` and
        ``test_labels`` hold the target column. All indices are reset.

    Raises
    ------
    ValueError
        If either ratio is not strictly positive, or if
        ``val_ratio + test_ratio`` is 1 or more (no training data would
        remain).
    """
    # Fail fast with an explicit message. Otherwise a bad ratio only
    # surfaces as a less helpful error from ``train_test_split`` about a
    # derived ``test_size`` value.
    if not (val_ratio > 0 and test_ratio > 0):
        raise ValueError(
            "val_ratio and test_ratio must both be > 0, got "
            f"val_ratio={val_ratio!r}, test_ratio={test_ratio!r}"
        )
    if val_ratio + test_ratio >= 1.0:
        raise ValueError(
            "val_ratio + test_ratio must be < 1 so that training data "
            f"remains, got val_ratio={val_ratio!r}, test_ratio={test_ratio!r}"
        )

    df = config.df.copy()
    target = config.target_column

    # 1) Separate test set
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_ratio,
        random_state=seed,
        shuffle=True,
    )

    # 2) Separate validation from remaining train.
    # ``val_ratio`` is relative to the full dataset, so rescale it to the
    # size of what is left after the test rows were removed.
    relative_val = val_ratio / (1.0 - test_ratio)
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=relative_val,
        random_state=seed,
        shuffle=True,
    )

    # Reset indices for cleanliness
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Keep only configured feature columns that are present in the data.
    # The target is excluded explicitly: if it were listed among the
    # features it would leak into the "features only" val/test frames.
    feature_cols: List[str] = [
        c
        for c in config.feature_columns
        if c in df.columns and c != target
    ]

    return DataSplit(
        train_df=train_df,                       # full (features + target)
        val_features=val_df[feature_cols],       # features only
        val_labels=val_df[target],               # hidden
        test_features=test_df[feature_cols],     # features only
        test_labels=test_df[target],             # hidden
    )