# copper-mind/deep_learning/data/augmentation.py
"""
Time Series Data Augmentation for TFT-ASRO.
Applies conservative augmentation techniques to increase effective training
set size without introducing unrealistic patterns.
Techniques:
- Jittering: Add small Gaussian noise to feature values
- Magnitude Warping: Scale features by small random factors
- Window Slicing: Create shifted sub-windows from the training data
Reference: Um et al. (2017) "Data Augmentation of Wearable Sensor Data" (ICMI)
"""
from __future__ import annotations
import logging
import numpy as np
import pandas as pd
logger = logging.getLogger(__name__)
def jitter(
    df: pd.DataFrame,
    feature_cols: list[str],
    sigma: float = 0.005,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with Gaussian noise added to ``feature_cols``.

    Noise for each column is drawn with standard deviation equal to
    ``sigma`` times that column's own standard deviation, so features on
    different scales receive proportionally sized perturbations.
    Near-constant columns (std below 1e-12) are left untouched.
    """
    rng = np.random.RandomState(seed)
    out = df.copy()
    n_rows = len(out)
    for name in feature_cols:
        spread = out[name].std()
        # Skip degenerate (near-constant) columns; noise scaled by ~0
        # would be meaningless anyway.
        if spread >= 1e-12:
            out[name] = out[name] + rng.normal(0.0, sigma * spread, size=n_rows)
    return out
def magnitude_warp(
    df: pd.DataFrame,
    feature_cols: list[str],
    sigma: float = 0.02,
    seed: int = 43,
) -> pd.DataFrame:
    """
    Multiply feature values by smooth random factors centered at 1.0.

    Uses cubic spline interpolation over a few knots to create slowly-varying
    scale factors, preserving local structure.

    Args:
        df: Input DataFrame (not modified in place).
        feature_cols: Columns to scale; one independent warp curve each.
        sigma: Std-dev of the per-knot scale factors around 1.0.
        seed: Random seed for reproducibility.

    Returns:
        A copy of df with each feature column multiplied by its warp curve.
        Frames with fewer than 2 rows are returned as an unmodified copy.
    """
    from scipy.interpolate import CubicSpline
    augmented = df.copy()
    n = len(augmented)
    # Guard: with n < 2, np.linspace(0, n - 1, 4) does not yield strictly
    # increasing knot positions (all zeros for n == 1, decreasing for n == 0),
    # which CubicSpline rejects with a ValueError. There is also no meaningful
    # "slowly varying" curve over fewer than two points, so return unchanged.
    if n < 2:
        return augmented
    rng = np.random.RandomState(seed)
    n_knots = 4
    knot_positions = np.linspace(0, n - 1, n_knots)
    x = np.arange(n)
    for col in feature_cols:
        # Draw a fresh set of knot scale factors per column so features are
        # warped independently.
        knot_values = rng.normal(1.0, sigma, size=n_knots)
        cs = CubicSpline(knot_positions, knot_values)
        warp_factor = cs(x)
        augmented[col] = augmented[col] * warp_factor
    return augmented
def augment_training_data(
    df: pd.DataFrame,
    feature_cols: list[str],
    target_col: str = "target",
    augment_ratio: float = 0.15,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Augment training DataFrame with jittered and warped copies.

    Appends augmented rows to the original, preserving time_idx ordering
    by offsetting augmented indices past the original range. Augmented rows
    are assigned their own group_id ("copper_aug") so they form a separate
    series.

    Args:
        df: Training DataFrame (must have time_idx and group_id).
        feature_cols: Feature columns to augment (target is preserved exact).
        target_col: Column excluded from augmentation.
        augment_ratio: Fraction of original data to add (0.15 = 15%).
            Ratios above 1.0 are supported via sampling with replacement.
        seed: Random seed.

    Returns:
        Augmented DataFrame with updated time_idx for new rows, or the
        original df unchanged when the ratio would add fewer than 10 rows.
    """
    n_original = len(df)
    n_augment = int(n_original * augment_ratio)
    if n_augment < 10:
        logger.info("Augmentation: ratio=%.2f yields <10 rows, skipping", augment_ratio)
        return df
    rng = np.random.RandomState(seed)
    # Sampling without replacement raises ValueError once n_augment exceeds
    # n_original (augment_ratio > 1.0); fall back to replacement only then so
    # behavior for ratios <= 1.0 is unchanged.
    sample_idx = rng.choice(n_original, size=n_augment, replace=n_augment > n_original)
    sample = df.iloc[sample_idx].copy()
    # Never perturb the target itself: labels stay exact.
    aug_features = [c for c in feature_cols if c != target_col]
    aug_jitter = jitter(sample, aug_features, sigma=0.005, seed=seed)
    aug_warped = magnitude_warp(aug_jitter, aug_features, sigma=0.02, seed=seed + 1)
    # Offset time indices past the original range so augmented rows never
    # collide with real timestamps.
    max_time_idx = df["time_idx"].max()
    aug_warped["time_idx"] = np.arange(max_time_idx + 1, max_time_idx + 1 + n_augment)
    aug_warped["group_id"] = "copper_aug"
    combined = pd.concat([df, aug_warped], ignore_index=True)
    combined = combined.sort_values("time_idx").reset_index(drop=True)
    logger.info(
        "Augmentation: added %d rows (%.0f%%) → total %d rows",
        n_augment, augment_ratio * 100, len(combined),
    )
    return combined