# Generated by Claude Code -- 2026-02-08
"""Data augmentation for the conjunction prediction dataset.
The fundamental problem: only 67 high-risk events out of 13,154 in training (0.5%).
This module provides two augmentation strategies:
1. SPACE-TRACK INTEGRATION: Merge real high-risk CDMs from Space-Track's cdm_public
feed. These have fewer features (16 vs 103) but provide real positive examples.
2. TIME-SERIES AUGMENTATION: Create synthetic variants of existing high-risk events
by applying realistic perturbations:
- Gaussian noise on covariance/position/velocity features
- Temporal jittering (shift CDM creation times slightly)
- Feature dropout (randomly zero out some features, simulating missing data)
- Sequence truncation (remove early CDMs, simulating late detection)
Both strategies are physics-aware: they don't generate impossible configurations
(e.g., negative miss distances or covariance values).
"""
import numpy as np
import pandas as pd
from pathlib import Path
def augment_event_noise(
event_df: pd.DataFrame,
noise_scale: float = 0.05,
n_augments: int = 5,
rng: np.random.Generator = None,
) -> list[pd.DataFrame]:
"""
Create n_augments noisy variants of a single conjunction event.
Applies Gaussian noise to numeric features, scaled by each column's
standard deviation within the event. Preserves event_id structure and
ensures physical constraints (non-negative distances, etc.).
"""
if rng is None:
rng = np.random.default_rng(42)
# Identify numeric columns to perturb (exclude IDs and targets)
exclude = {"event_id", "time_to_tca", "risk", "mission_id", "source"}
numeric_cols = event_df.select_dtypes(include=[np.number]).columns
perturb_cols = [c for c in numeric_cols if c not in exclude]
augmented = []
for i in range(n_augments):
aug = event_df.copy()
for col in perturb_cols:
values = aug[col].values.astype(float)
col_std = np.std(values)
if col_std < 1e-10:
col_std = np.abs(np.mean(values)) * 0.01 + 1e-10
noise = rng.normal(0, noise_scale * col_std, size=len(values))
aug[col] = values + noise
# Physical constraints
if "miss_distance" in aug.columns:
aug["miss_distance"] = aug["miss_distance"].clip(lower=0)
if "relative_speed" in aug.columns:
aug["relative_speed"] = aug["relative_speed"].clip(lower=0)
# Ensure covariance sigma columns stay positive
sigma_cols = [c for c in perturb_cols if "sigma" in c.lower()]
for col in sigma_cols:
aug[col] = aug[col].clip(lower=0)
augmented.append(aug)
return augmented
def augment_event_truncate(
event_df: pd.DataFrame,
min_keep: int = 3,
n_augments: int = 3,
rng: np.random.Generator = None,
) -> list[pd.DataFrame]:
"""
Create truncated variants by removing early CDMs.
Simulates late-detection scenarios where only the most recent CDMs
are available (closer to TCA).
"""
if rng is None:
rng = np.random.default_rng(42)
# Sort by time_to_tca descending (first CDM = furthest from TCA)
event_df = event_df.sort_values("time_to_tca", ascending=False)
n_cdms = len(event_df)
if n_cdms <= min_keep:
return []
augmented = []
for _ in range(n_augments):
# Keep between min_keep and n_cdms-1 CDMs (always keep the last few)
n_keep = rng.integers(min_keep, n_cdms)
aug = event_df.iloc[-n_keep:].copy()
augmented.append(aug)
return augmented
def augment_positive_events(
    df: pd.DataFrame,
    target_ratio: float = 0.05,
    noise_scale: float = 0.05,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Augment the positive (high-risk) class to reach target_ratio.

    An event counts as positive when its final CDM's risk exceeds -5.
    Synthetic positives are built by noising — and occasionally truncating —
    randomly chosen real positive events; each synthetic event receives a
    fresh event_id and source == "augmented".

    Args:
        df: full training DataFrame with event_id, risk columns.
        target_ratio: desired fraction of high-risk events (default 5%).
        noise_scale: std dev of Gaussian noise as fraction of feature std.
        seed: random seed.

    Returns:
        Augmented DataFrame with new synthetic positive events appended, or
        df unchanged when the target is already met or when there are no
        positive events to clone from.
    """
    rng = np.random.default_rng(seed)

    # Classify events by the risk of their final (most recent) CDM.
    event_risks = df.groupby("event_id")["risk"].last()
    pos_event_ids = event_risks[event_risks > -5].index.tolist()
    neg_event_ids = event_risks[event_risks <= -5].index.tolist()
    n_pos = len(pos_event_ids)
    n_neg = len(neg_event_ids)
    n_total = n_pos + n_neg

    # Approximate count of positives needed so they make up target_ratio of
    # the enlarged set; slightly overshoots when n_pos > 0, which is harmless.
    target_pos = int(target_ratio * (n_total / (1 - target_ratio)))
    n_needed = max(0, target_pos - n_pos)
    if n_needed == 0:
        print(f"Already at target ratio ({n_pos}/{n_total} = {n_pos/n_total:.1%})")
        return df
    if n_pos == 0:
        # BUG FIX: with zero positives, rng.choice(pos_event_ids) below would
        # raise ValueError on an empty list. Bail out gracefully instead —
        # Space-Track integration may supply positives on a later pass.
        print("No positive events available to augment; returning df unchanged")
        return df

    print(f"Augmenting: {n_pos} positive events → {n_pos + n_needed} "
          f"(target {target_ratio:.0%} of {n_total + n_needed})")

    # Synthetic events get fresh ids above the current maximum.
    max_event_id = df["event_id"].max()
    augmented_dfs = []
    generated = 0
    while generated < n_needed:
        # Clone a random real positive event.
        src_event_id = rng.choice(pos_event_ids)
        src_event = df[df["event_id"] == src_event_id]

        aug_variants = augment_event_noise(
            src_event, noise_scale=noise_scale, n_augments=1, rng=rng
        )
        # ~30% of the time also add a late-detection (truncated) variant.
        if rng.random() < 0.3 and len(src_event) > 3:
            aug_variants.extend(
                augment_event_truncate(src_event, n_augments=1, rng=rng)
            )

        for aug_df in aug_variants:
            if generated >= n_needed:
                break
            max_event_id += 1
            aug_df = aug_df.copy()
            aug_df["event_id"] = max_event_id
            aug_df["source"] = "augmented"
            augmented_dfs.append(aug_df)
            generated += 1

    if augmented_dfs:
        augmented = pd.concat(augmented_dfs, ignore_index=True)
        result = pd.concat([df, augmented], ignore_index=True)
        # Verify the achieved class balance.
        event_risks = result.groupby("event_id")["risk"].last()
        new_pos = (event_risks > -5).sum()
        new_total = len(event_risks)
        print(f"Result: {new_pos} positive / {new_total} total "
              f"({new_pos/new_total:.1%})")
        return result
    return df
def integrate_spacetrack_positives(
    kelvins_df: pd.DataFrame,
    spacetrack_path: Path,
) -> pd.DataFrame:
    """
    Fold Space-Track emergency CDMs into the training set as extra positives.

    Space-Track's cdm_public feed exposes only 16 features versus the 103 in
    the Kelvins data; the merge step fills the missing ones with 0 and lets
    the model learn from whatever features are present. When the file is
    absent, the Kelvins DataFrame is returned untouched.
    """
    if not spacetrack_path.exists():
        print(f"No Space-Track data at {spacetrack_path}")
        return kelvins_df
    # Imported lazily so the module loads even without the merge package.
    from src.data.merge_sources import (
        load_spacetrack_cdms, group_into_events, merge_datasets
    )
    st_events = group_into_events(load_spacetrack_cdms(spacetrack_path))
    return merge_datasets(kelvins_df, st_events)
def build_augmented_training_set(
    data_dir: Path,
    target_positive_ratio: float = 0.05,
    noise_scale: float = 0.05,
    seed: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Assemble the complete augmented training set from every available source.

    Pipeline:
      1. Load the ESA Kelvins train/test split.
      2. Fold Space-Track emergency CDMs into the training portion.
      3. Run time-series augmentation on the positive (high-risk) events.
      4. Return (augmented_train, original_test).

    The test set is NEVER augmented — it stays as Kelvins-only for fair
    evaluation.
    """
    from src.data.cdm_loader import load_dataset

    banner = "=" * 60
    print(banner)
    print(" Building Augmented Training Set")
    print(banner)

    # --- Step 1: Kelvins base data -------------------------------------
    print("\n1. Loading ESA Kelvins dataset ...")
    train, test = load_dataset(data_dir / "cdm")
    # Copy to defragment, then tag provenance on every row.
    train, test = train.copy(), test.copy()
    train["source"] = "kelvins"
    test["source"] = "kelvins"

    # Positive = final CDM risk above -5.
    per_event_risk = train.groupby("event_id")["risk"].last()
    pos0 = (per_event_risk > -5).sum()
    tot0 = len(per_event_risk)
    print(f" Initial: {pos0} positive / {tot0} total "
          f"({pos0/tot0:.2%})")

    # --- Step 2: Space-Track integration -------------------------------
    st_path = data_dir / "cdm_spacetrack" / "cdm_spacetrack_emergency.csv"
    if st_path.exists():
        print("\n2. Integrating Space-Track emergency CDMs ...")
        train = integrate_spacetrack_positives(train, st_path)
    else:
        print("\n2. No Space-Track data found (skipping)")

    # --- Step 3: positive-class augmentation ---------------------------
    print(f"\n3. Augmenting positive events (target ratio: {target_positive_ratio:.0%}) ...")
    train = augment_positive_events(
        train,
        target_ratio=target_positive_ratio,
        noise_scale=noise_scale,
        seed=seed,
    )

    # --- Final stats ----------------------------------------------------
    per_event_risk = train.groupby("event_id")["risk"].last()
    per_event_source = train.groupby("event_id")["source"].first()
    counts = {
        origin: (per_event_source == origin).sum()
        for origin in ("kelvins", "spacetrack", "augmented")
    }
    pos1 = (per_event_risk > -5).sum()
    tot1 = len(per_event_risk)

    print(f"\n{'=' * 60}")
    print(" Final Training Set:")
    print(f" Kelvins events: {counts['kelvins']}")
    print(f" Space-Track events: {counts['spacetrack']}")
    print(f" Augmented events: {counts['augmented']}")
    print(f" Total events: {tot1}")
    print(f" Positive events: {pos1} ({pos1/tot1:.1%})")
    print(f" Total CDM rows: {len(train)}")
    print(f"{'=' * 60}")
    return train, test
|