Spaces:
Running
Running
| """ | |
| Time Series Data Augmentation for TFT-ASRO. | |
| Applies conservative augmentation techniques to increase effective training | |
| set size without introducing unrealistic patterns. | |
| Techniques: | |
| - Jittering: Add small Gaussian noise to feature values | |
| - Magnitude Warping: Scale features by small random factors | |
| - Window Slicing: Create shifted sub-windows from the training data | |
| Reference: Um et al. (2017) "Data Augmentation of Wearable Sensor Data" (ICMI) | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import numpy as np | |
| import pandas as pd | |
| logger = logging.getLogger(__name__) | |
| def jitter( | |
| df: pd.DataFrame, | |
| feature_cols: list[str], | |
| sigma: float = 0.005, | |
| seed: int = 42, | |
| ) -> pd.DataFrame: | |
| """ | |
| Add Gaussian noise to feature columns. | |
| The noise magnitude is relative to each feature's standard deviation | |
| to maintain scale consistency across features with different ranges. | |
| """ | |
| rng = np.random.RandomState(seed) | |
| augmented = df.copy() | |
| for col in feature_cols: | |
| col_std = augmented[col].std() | |
| if col_std < 1e-12: | |
| continue | |
| noise = rng.normal(0, sigma * col_std, size=len(augmented)) | |
| augmented[col] = augmented[col] + noise | |
| return augmented | |
| def magnitude_warp( | |
| df: pd.DataFrame, | |
| feature_cols: list[str], | |
| sigma: float = 0.02, | |
| seed: int = 43, | |
| ) -> pd.DataFrame: | |
| """ | |
| Multiply feature values by smooth random factors centered at 1.0. | |
| Uses cubic spline interpolation over a few knots to create slowly-varying | |
| scale factors, preserving local structure. | |
| """ | |
| from scipy.interpolate import CubicSpline | |
| rng = np.random.RandomState(seed) | |
| augmented = df.copy() | |
| n = len(augmented) | |
| n_knots = 4 | |
| knot_positions = np.linspace(0, n - 1, n_knots) | |
| x = np.arange(n) | |
| for col in feature_cols: | |
| knot_values = rng.normal(1.0, sigma, size=n_knots) | |
| cs = CubicSpline(knot_positions, knot_values) | |
| warp_factor = cs(x) | |
| augmented[col] = augmented[col] * warp_factor | |
| return augmented | |
| def augment_training_data( | |
| df: pd.DataFrame, | |
| feature_cols: list[str], | |
| target_col: str = "target", | |
| augment_ratio: float = 0.15, | |
| seed: int = 42, | |
| ) -> pd.DataFrame: | |
| """ | |
| Augment training DataFrame with jittered and warped copies. | |
| Appends augmented rows to the original, preserving time_idx ordering | |
| by offsetting augmented indices past the original range. | |
| Args: | |
| df: Training DataFrame (must have time_idx and group_id). | |
| feature_cols: Feature columns to augment (target is preserved exact). | |
| augment_ratio: Fraction of original data to add (0.15 = 15%). | |
| seed: Random seed. | |
| Returns: | |
| Augmented DataFrame with updated time_idx for new rows. | |
| """ | |
| n_original = len(df) | |
| n_augment = int(n_original * augment_ratio) | |
| if n_augment < 10: | |
| logger.info("Augmentation: ratio=%.2f yields <10 rows, skipping", augment_ratio) | |
| return df | |
| rng = np.random.RandomState(seed) | |
| sample_idx = rng.choice(n_original, size=n_augment, replace=False) | |
| sample = df.iloc[sample_idx].copy() | |
| aug_features = [c for c in feature_cols if c != target_col] | |
| aug_jitter = jitter(sample, aug_features, sigma=0.005, seed=seed) | |
| aug_warped = magnitude_warp(aug_jitter, aug_features, sigma=0.02, seed=seed + 1) | |
| max_time_idx = df["time_idx"].max() | |
| aug_warped["time_idx"] = np.arange(max_time_idx + 1, max_time_idx + 1 + n_augment) | |
| aug_warped["group_id"] = "copper_aug" | |
| combined = pd.concat([df, aug_warped], ignore_index=True) | |
| combined = combined.sort_values("time_idx").reset_index(drop=True) | |
| logger.info( | |
| "Augmentation: added %d rows (%.0f%%) → total %d rows", | |
| n_augment, augment_ratio * 100, len(combined), | |
| ) | |
| return combined | |