File size: 3,871 Bytes
18d4089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Time Series Data Augmentation for TFT-ASRO.

Applies conservative augmentation techniques to increase effective training
set size without introducing unrealistic patterns.

Techniques:
    - Jittering: Add small Gaussian noise to feature values
    - Magnitude Warping: Scale features by small random factors
    - Window Slicing: Create shifted sub-windows from the training data

Reference: Um et al. (2017) "Data Augmentation of Wearable Sensor Data" (ICMI)
"""

from __future__ import annotations

import logging

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


def jitter(
    df: pd.DataFrame,
    feature_cols: list[str],
    sigma: float = 0.005,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Return a copy of *df* with Gaussian noise added to the feature columns.

    Noise for each column is drawn from N(0, sigma * std(col)), so the
    perturbation stays proportional to each feature's natural scale and
    features with very different ranges are treated consistently.
    Near-constant columns (std below 1e-12) are left untouched.
    """
    rng = np.random.RandomState(seed)
    out = df.copy()

    for name in feature_cols:
        spread = out[name].std()
        # Skip effectively-constant columns: noise scaled by ~0 would be a
        # no-op, and this keeps the RNG stream aligned with meaningful draws.
        if spread < 1e-12:
            continue
        out[name] += rng.normal(0, sigma * spread, size=len(out))

    return out


def magnitude_warp(
    df: pd.DataFrame,
    feature_cols: list[str],
    sigma: float = 0.02,
    seed: int = 43,
) -> pd.DataFrame:
    """
    Multiply feature values by smooth random factors centered at 1.0.

    Uses cubic spline interpolation over a few knots to create slowly-varying
    scale factors, preserving local structure.

    Args:
        df:           Input DataFrame (copied, never modified in place).
        feature_cols: Columns to warp.
        sigma:        Std-dev of the random knot scale factors around 1.0.
        seed:         Random seed.

    Returns:
        A warped copy of *df*. Frames with fewer than two rows are returned
        as unmodified copies (a spline needs strictly increasing knots).
    """
    from scipy.interpolate import CubicSpline

    rng = np.random.RandomState(seed)
    augmented = df.copy()
    n = len(augmented)
    n_knots = 4

    # Guard: with n < 2 the knot positions produced by linspace are not
    # strictly increasing (all zero for n == 1, decreasing for n == 0) and
    # CubicSpline raises ValueError. Warping a 0- or 1-row frame is
    # meaningless anyway, so return it untouched.
    if n < 2:
        return augmented

    knot_positions = np.linspace(0, n - 1, n_knots)
    x = np.arange(n)

    for col in feature_cols:
        knot_values = rng.normal(1.0, sigma, size=n_knots)
        cs = CubicSpline(knot_positions, knot_values)
        warp_factor = cs(x)
        augmented[col] = augmented[col] * warp_factor

    return augmented


def augment_training_data(
    df: pd.DataFrame,
    feature_cols: list[str],
    target_col: str = "target",
    augment_ratio: float = 0.15,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Augment training DataFrame with jittered and warped copies.

    Appends augmented rows to the original, preserving time_idx ordering
    by offsetting augmented indices past the original range. The target
    column is never perturbed; only the feature columns are.

    Args:
        df:             Training DataFrame (must have time_idx and group_id).
        feature_cols:   Feature columns to augment (target is preserved exact).
        target_col:     Column excluded from perturbation even if listed in
                        feature_cols.
        augment_ratio:  Fraction of original data to add (0.15 = 15%). Ratios
                        above 1.0 sample source rows with replacement.
        seed:           Random seed.

    Returns:
        Augmented DataFrame with updated time_idx for new rows; augmented
        rows carry group_id == "copper_aug". Returns *df* unchanged when the
        ratio would add fewer than 10 rows.
    """
    n_original = len(df)
    n_augment = int(n_original * augment_ratio)
    if n_augment < 10:
        logger.info("Augmentation: ratio=%.2f yields <10 rows, skipping", augment_ratio)
        return df

    rng = np.random.RandomState(seed)
    # Sampling without replacement is impossible once we need more rows than
    # exist; fall back to replacement for augment_ratio > 1.0 instead of
    # letting rng.choice raise ValueError.
    sample_idx = rng.choice(n_original, size=n_augment, replace=n_augment > n_original)
    sample = df.iloc[sample_idx].copy()

    # Never perturb the target, even if the caller included it in feature_cols.
    aug_features = [c for c in feature_cols if c != target_col]

    aug_jitter = jitter(sample, aug_features, sigma=0.005, seed=seed)
    aug_warped = magnitude_warp(aug_jitter, aug_features, sigma=0.02, seed=seed + 1)

    # Place augmented rows strictly after the original time range so time_idx
    # stays unique and monotone; a distinct group_id keeps the model able to
    # tell synthetic rows from real ones.
    max_time_idx = df["time_idx"].max()
    aug_warped["time_idx"] = np.arange(max_time_idx + 1, max_time_idx + 1 + n_augment)
    aug_warped["group_id"] = "copper_aug"

    combined = pd.concat([df, aug_warped], ignore_index=True)
    combined = combined.sort_values("time_idx").reset_index(drop=True)

    logger.info(
        "Augmentation: added %d rows (%.0f%%) → total %d rows",
        n_augment, augment_ratio * 100, len(combined),
    )
    return combined