File size: 10,340 Bytes
a4b5ecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
# Generated by Claude Code -- 2026-02-08
"""Data augmentation for the conjunction prediction dataset.

The fundamental problem: only 67 high-risk events out of 13,154 in training (0.5%).

This module provides two augmentation strategies:

1. SPACE-TRACK INTEGRATION: Merge real high-risk CDMs from Space-Track's cdm_public
   feed. These have fewer features (16 vs 103) but provide real positive examples.

2. TIME-SERIES AUGMENTATION: Create synthetic variants of existing high-risk events
   by applying realistic perturbations:
   - Gaussian noise on covariance/position/velocity features
   - Temporal jittering (shift CDM creation times slightly)
   - Feature dropout (randomly zero out some features, simulating missing data)
   - Sequence truncation (remove early CDMs, simulating late detection)

Both strategies are physics-aware: they don't generate impossible configurations
(e.g., negative miss distances or covariance values).
"""

from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd


def augment_event_noise(
    event_df: pd.DataFrame,
    noise_scale: float = 0.05,
    n_augments: int = 5,
    rng: np.random.Generator | None = None,
) -> list[pd.DataFrame]:
    """Create ``n_augments`` noisy variants of a single conjunction event.

    Applies Gaussian noise to the numeric feature columns, scaled by each
    column's standard deviation within the event. ID/target columns are
    left untouched and physical constraints are re-imposed afterwards
    (non-negative distances, speeds, and covariance sigmas).

    Args:
        event_df: all CDM rows of one event.
        noise_scale: noise std dev as a fraction of each column's std.
        n_augments: number of perturbed copies to generate.
        rng: random generator; a fixed-seed default is used when None so
            results are reproducible.

    Returns:
        List of ``n_augments`` perturbed copies of ``event_df``.
    """
    if rng is None:
        rng = np.random.default_rng(42)

    # Identify numeric columns to perturb (exclude IDs and targets).
    exclude = {"event_id", "time_to_tca", "risk", "mission_id", "source"}
    numeric_cols = event_df.select_dtypes(include=[np.number]).columns
    perturb_cols = [c for c in numeric_cols if c not in exclude]

    # Covariance sigma columns must stay non-negative; hoist the
    # loop-invariant scan out of the augmentation loop.
    sigma_cols = [c for c in perturb_cols if "sigma" in c.lower()]

    augmented: list[pd.DataFrame] = []
    for _ in range(n_augments):
        aug = event_df.copy()

        for col in perturb_cols:
            values = aug[col].to_numpy(dtype=float)
            col_std = np.std(values)
            if col_std < 1e-10:
                # Near-constant column: fall back to 1% of its mean
                # magnitude so the noise is still proportionate.
                col_std = np.abs(np.mean(values)) * 0.01 + 1e-10

            noise = rng.normal(0, noise_scale * col_std, size=len(values))
            aug[col] = values + noise

        # Physical constraints: distances and speeds cannot go negative.
        if "miss_distance" in aug.columns:
            aug["miss_distance"] = aug["miss_distance"].clip(lower=0)
        if "relative_speed" in aug.columns:
            aug["relative_speed"] = aug["relative_speed"].clip(lower=0)

        for col in sigma_cols:
            aug[col] = aug[col].clip(lower=0)

        augmented.append(aug)

    return augmented


def augment_event_truncate(
    event_df: pd.DataFrame,
    min_keep: int = 3,
    n_augments: int = 3,
    rng: np.random.Generator | None = None,
) -> list[pd.DataFrame]:
    """Create truncated variants of an event by dropping its earliest CDMs.

    Simulates late-detection scenarios where only the most recent CDMs
    (those closest to TCA) are available.

    Args:
        event_df: all CDM rows of one event; must contain "time_to_tca".
        min_keep: minimum number of trailing CDMs each variant keeps.
        n_augments: number of truncated copies to generate.
        rng: random generator; a fixed-seed default is used when None.

    Returns:
        ``n_augments`` truncated copies, or an empty list when the event
        already has at most ``min_keep`` CDMs (nothing to drop).
    """
    if rng is None:
        rng = np.random.default_rng(42)

    # Sort by time_to_tca descending so the first row is furthest from TCA.
    event_df = event_df.sort_values("time_to_tca", ascending=False)
    n_cdms = len(event_df)

    if n_cdms <= min_keep:
        return []

    augmented = []
    for _ in range(n_augments):
        # rng.integers has an exclusive upper bound, so n_keep is in
        # [min_keep, n_cdms - 1]: always drop at least one early CDM
        # while always keeping the last min_keep.
        n_keep = int(rng.integers(min_keep, n_cdms))
        augmented.append(event_df.iloc[-n_keep:].copy())

    return augmented


def augment_positive_events(
    df: pd.DataFrame,
    target_ratio: float = 0.05,
    noise_scale: float = 0.05,
    seed: int = 42,
) -> pd.DataFrame:
    """Augment the positive (high-risk) class to reach ``target_ratio``.

    Synthetic positive events are created by perturbing randomly chosen
    existing positive events (Gaussian noise, plus occasional sequence
    truncation). Each synthetic event gets a fresh event_id and is tagged
    with source="augmented".

    Args:
        df: full training DataFrame with event_id, risk columns.
        target_ratio: desired fraction of high-risk events (default 5%).
        noise_scale: std dev of Gaussian noise as fraction of feature std.
        seed: random seed for reproducibility.

    Returns:
        DataFrame with new synthetic positive events appended, or ``df``
        unchanged when no augmentation is needed or possible.
    """
    rng = np.random.default_rng(seed)

    # An event is positive (high-risk) when its final CDM's risk exceeds
    # the -5 log10 threshold.
    event_risks = df.groupby("event_id")["risk"].last()
    pos_event_ids = event_risks[event_risks > -5].index.tolist()
    neg_event_ids = event_risks[event_risks <= -5].index.tolist()

    n_pos = len(pos_event_ids)
    n_neg = len(neg_event_ids)
    n_total = n_pos + n_neg

    if n_pos == 0:
        # No positives to clone from — the generation loop below would
        # never terminate (also guards the ratio prints against n_total=0).
        print("No positive events available to augment; returning input unchanged")
        return df

    # Solve target_pos / (target_pos + n_neg) = target_ratio for the
    # positive-event count the final set should contain. (Using n_total
    # here instead of n_neg would overshoot the requested ratio.)
    target_pos = int(target_ratio * n_neg / (1 - target_ratio))
    n_needed = max(0, target_pos - n_pos)

    if n_needed == 0:
        print(f"Already at target ratio ({n_pos}/{n_total} = {n_pos/n_total:.1%})")
        return df

    print(f"Augmenting: {n_pos} positive events → {n_pos + n_needed} "
          f"(target {target_ratio:.0%} of {n_total + n_needed})")

    # Generate synthetic events until the deficit is covered.
    max_event_id = df["event_id"].max()
    augmented_dfs = []
    generated = 0

    while generated < n_needed:
        # Pick a random positive event to perturb.
        src_event_id = rng.choice(pos_event_ids)
        src_event = df[df["event_id"] == src_event_id]

        # Always apply noise augmentation ...
        aug_variants = augment_event_noise(
            src_event, noise_scale=noise_scale, n_augments=1, rng=rng
        )

        # ... and truncation ~30% of the time, when the event has enough
        # CDMs for truncation to produce a non-trivial variant.
        if rng.random() < 0.3 and len(src_event) > 3:
            aug_variants.extend(
                augment_event_truncate(src_event, n_augments=1, rng=rng)
            )

        for aug_df in aug_variants:
            if generated >= n_needed:
                break
            max_event_id += 1
            aug_df = aug_df.copy()
            aug_df["event_id"] = max_event_id
            aug_df["source"] = "augmented"
            augmented_dfs.append(aug_df)
            generated += 1

    # n_needed > 0 guarantees at least one synthetic event was produced.
    augmented = pd.concat(augmented_dfs, ignore_index=True)
    result = pd.concat([df, augmented], ignore_index=True)

    # Report the achieved class balance.
    event_risks = result.groupby("event_id")["risk"].last()
    new_pos = (event_risks > -5).sum()
    new_total = len(event_risks)
    print(f"Result: {new_pos} positive / {new_total} total "
          f"({new_pos/new_total:.1%})")
    return result


def integrate_spacetrack_positives(
    kelvins_df: pd.DataFrame,
    spacetrack_path: Path,
) -> pd.DataFrame:
    """Merge Space-Track emergency CDMs into the Kelvins training frame.

    Space-Track's cdm_public feed carries only 16 of Kelvins' 103
    features; the merge fills the missing ones with 0 and the model
    learns to use whatever is available. When no Space-Track file exists
    at ``spacetrack_path``, the input frame is returned unchanged.
    """
    if not spacetrack_path.exists():
        print(f"No Space-Track data at {spacetrack_path}")
        return kelvins_df

    # Imported lazily so the module loads even without the merge helpers.
    from src.data.merge_sources import (
        load_spacetrack_cdms, group_into_events, merge_datasets
    )

    spacetrack_events = group_into_events(load_spacetrack_cdms(spacetrack_path))
    return merge_datasets(kelvins_df, spacetrack_events)


def build_augmented_training_set(
    data_dir: Path,
    target_positive_ratio: float = 0.05,
    noise_scale: float = 0.05,
    seed: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Assemble the full augmented training set from all available sources.

    Pipeline:
    1. Load the ESA Kelvins train/test split.
    2. Merge Space-Track emergency CDMs into the training set (if present).
    3. Apply time-series augmentation to the positive events.
    4. Return (augmented_train, original_test).

    The test set is NEVER augmented — it stays Kelvins-only so evaluation
    remains fair.
    """
    from src.data.cdm_loader import load_dataset

    banner = "=" * 60
    print(banner)
    print("  Building Augmented Training Set")
    print(banner)

    # --- Step 1: Kelvins base dataset --------------------------------
    print("\n1. Loading ESA Kelvins dataset ...")
    train_df, test_df = load_dataset(data_dir / "cdm")

    # Copy to defragment the frames, then tag row provenance.
    train_df = train_df.copy()
    test_df = test_df.copy()
    train_df["source"] = "kelvins"
    test_df["source"] = "kelvins"

    # Class balance before any augmentation (positive = last risk > -5).
    risks_by_event = train_df.groupby("event_id")["risk"].last()
    initial_pos = (risks_by_event > -5).sum()
    initial_total = len(risks_by_event)
    print(f"   Initial: {initial_pos} positive / {initial_total} total "
          f"({initial_pos/initial_total:.2%})")

    # --- Step 2: Space-Track integration -----------------------------
    spacetrack_csv = data_dir / "cdm_spacetrack" / "cdm_spacetrack_emergency.csv"
    if spacetrack_csv.exists():
        print("\n2. Integrating Space-Track emergency CDMs ...")
        train_df = integrate_spacetrack_positives(train_df, spacetrack_csv)
    else:
        print("\n2. No Space-Track data found (skipping)")

    # --- Step 3: time-series augmentation ----------------------------
    print(f"\n3. Augmenting positive events (target ratio: {target_positive_ratio:.0%}) ...")
    train_df = augment_positive_events(
        train_df,
        target_ratio=target_positive_ratio,
        noise_scale=noise_scale,
        seed=seed,
    )

    # --- Final composition report ------------------------------------
    risks_by_event = train_df.groupby("event_id")["risk"].last()
    sources_by_event = train_df.groupby("event_id")["source"].first()
    kelvins_events = (sources_by_event == "kelvins").sum()
    spacetrack_events = (sources_by_event == "spacetrack").sum()
    augmented_events = (sources_by_event == "augmented").sum()
    final_pos = (risks_by_event > -5).sum()
    final_total = len(risks_by_event)

    print(f"\n{banner}")
    print("  Final Training Set:")
    print(f"    Kelvins events:     {kelvins_events}")
    print(f"    Space-Track events: {spacetrack_events}")
    print(f"    Augmented events:   {augmented_events}")
    print(f"    Total events:       {final_total}")
    print(f"    Positive events:    {final_pos} ({final_pos/final_total:.1%})")
    print(f"    Total CDM rows:     {len(train_df)}")
    print(banner)

    return train_df, test_df