# copper-mind/deep_learning/data/augmentation.py
"""
Time Series Data Augmentation for TFT-ASRO.
Applies conservative augmentation techniques to increase effective training
set size without introducing unrealistic patterns.
Techniques:
- Jittering: Add small Gaussian noise to feature values
- Magnitude Warping: Scale features by small random factors
- Window Slicing: Create shifted sub-windows from the training data
Reference: Um et al. (2017) "Data Augmentation of Wearable Sensor Data" (ICMI)
"""
from __future__ import annotations
import logging
import numpy as np
import pandas as pd
logger = logging.getLogger(__name__)
def jitter(
    df: pd.DataFrame,
    feature_cols: list[str],
    sigma: float = 0.005,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with Gaussian noise added to ``feature_cols``.

    Noise for each column is drawn with standard deviation equal to
    ``sigma`` times that column's own standard deviation, so features on
    different scales receive proportionally sized perturbations.
    Near-constant columns (std below 1e-12) are left untouched.
    """
    rng = np.random.RandomState(seed)
    out = df.copy()
    n_rows = len(out)
    for name in feature_cols:
        spread = out[name].std()
        # Skip degenerate (near-constant) columns; noise scaled by ~0
        # would be meaningless anyway.
        if spread >= 1e-12:
            out[name] = out[name] + rng.normal(0.0, sigma * spread, size=n_rows)
    return out
def magnitude_warp(
    df: pd.DataFrame,
    feature_cols: list[str],
    sigma: float = 0.02,
    seed: int = 43,
) -> pd.DataFrame:
    """
    Multiply feature values by smooth random factors centered at 1.0.

    Uses cubic spline interpolation over a few knots to create slowly-varying
    scale factors, preserving local structure.

    Args:
        df: Input DataFrame (not modified in place).
        feature_cols: Columns to scale; one independent warp curve each.
        sigma: Std-dev of the per-knot scale factors around 1.0.
        seed: Random seed for reproducibility.

    Returns:
        A copy of df with each feature column multiplied by its warp curve.
        Frames with fewer than 2 rows are returned as an unmodified copy.
    """
    from scipy.interpolate import CubicSpline
    augmented = df.copy()
    n = len(augmented)
    # Guard: with n < 2, np.linspace(0, n - 1, 4) does not yield strictly
    # increasing knot positions (all zeros for n == 1, decreasing for n == 0),
    # which CubicSpline rejects with a ValueError. There is also no meaningful
    # "slowly varying" curve over fewer than two points, so return unchanged.
    if n < 2:
        return augmented
    rng = np.random.RandomState(seed)
    n_knots = 4
    knot_positions = np.linspace(0, n - 1, n_knots)
    x = np.arange(n)
    for col in feature_cols:
        # Draw a fresh set of knot scale factors per column so features are
        # warped independently.
        knot_values = rng.normal(1.0, sigma, size=n_knots)
        cs = CubicSpline(knot_positions, knot_values)
        warp_factor = cs(x)
        augmented[col] = augmented[col] * warp_factor
    return augmented
def augment_training_data(
    df: pd.DataFrame,
    feature_cols: list[str],
    target_col: str = "target",
    augment_ratio: float = 0.15,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Augment training DataFrame with jittered and warped copies.

    Appends augmented rows to the original, preserving time_idx ordering
    by offsetting augmented indices past the original range. Augmented rows
    are assigned their own group_id ("copper_aug") so they form a separate
    series.

    Args:
        df: Training DataFrame (must have time_idx and group_id).
        feature_cols: Feature columns to augment (target is preserved exact).
        target_col: Column excluded from augmentation.
        augment_ratio: Fraction of original data to add (0.15 = 15%).
            Ratios above 1.0 are supported via sampling with replacement.
        seed: Random seed.

    Returns:
        Augmented DataFrame with updated time_idx for new rows, or the
        original df unchanged when the ratio would add fewer than 10 rows.
    """
    n_original = len(df)
    n_augment = int(n_original * augment_ratio)
    if n_augment < 10:
        logger.info("Augmentation: ratio=%.2f yields <10 rows, skipping", augment_ratio)
        return df
    rng = np.random.RandomState(seed)
    # Sampling without replacement raises ValueError once n_augment exceeds
    # n_original (augment_ratio > 1.0); fall back to replacement only then so
    # behavior for ratios <= 1.0 is unchanged.
    sample_idx = rng.choice(n_original, size=n_augment, replace=n_augment > n_original)
    sample = df.iloc[sample_idx].copy()
    # Never perturb the target itself: labels stay exact.
    aug_features = [c for c in feature_cols if c != target_col]
    aug_jitter = jitter(sample, aug_features, sigma=0.005, seed=seed)
    aug_warped = magnitude_warp(aug_jitter, aug_features, sigma=0.02, seed=seed + 1)
    # Offset time indices past the original range so augmented rows never
    # collide with real timestamps.
    max_time_idx = df["time_idx"].max()
    aug_warped["time_idx"] = np.arange(max_time_idx + 1, max_time_idx + 1 + n_augment)
    aug_warped["group_id"] = "copper_aug"
    combined = pd.concat([df, aug_warped], ignore_index=True)
    combined = combined.sort_values("time_idx").reset_index(drop=True)
    logger.info(
        "Augmentation: added %d rows (%.0f%%) → total %d rows",
        n_augment, augment_ratio * 100, len(combined),
    )
    return combined