File size: 9,820 Bytes
a4b5ecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# Generated by Claude Code -- 2026-02-08
"""Merge CDM data from multiple sources into unified training format.



Combines:

  1. ESA Kelvins dataset (103 features, labeled)

  2. Space-Track cdm_public (16 features, unlabeled — derive risk from PC)



Strategy:

  - Space-Track CDMs are grouped into "conjunction events" by (SAT_1_ID, SAT_2_ID, TCA_date)

  - Each event gets a time series of CDMs ordered by CREATED date

  - Risk label derived from final PC: high risk if PC > 1e-5 (same threshold as Kelvins)

  - Features that exist in both sources get unified column names

  - Missing features (e.g., covariance in Space-Track) are filled with 0



This gives us far more positive examples for training the risk classifier,

even though the Space-Track data has fewer features per CDM.

"""

import numpy as np
import pandas as pd
from pathlib import Path
from datetime import timedelta


# Mapping from Space-Track CDM_PUBLIC fields → unified column names
SPACETRACK_COLUMN_MAP = {
    "CDM_ID": "cdm_id",
    "CREATED": "created",
    "TCA": "tca",
    "MIN_RNG": "miss_distance",      # km in Space-Track
    "PC": "collision_probability",
    "SAT_1_ID": "sat_1_id",
    "SAT_1_NAME": "sat_1_name",
    "SAT1_OBJECT_TYPE": "t_object_type",
    "SAT1_RCS": "t_rcs",
    "SAT_1_EXCL_VOL": "t_excl_vol",
    "SAT_2_ID": "sat_2_id",
    "SAT_2_NAME": "sat_2_name",
    "SAT2_OBJECT_TYPE": "c_object_type",
    "SAT2_RCS": "c_rcs",
    "SAT_2_EXCL_VOL": "c_excl_vol",
    "EMERGENCY_REPORTABLE": "emergency_reportable",
}

# Risk threshold: PC > 1e-5 = high risk (matches ESA Kelvins: risk > -5)
RISK_THRESHOLD = 1e-5


def load_spacetrack_cdms(csv_path: Path) -> pd.DataFrame:
    """Load Space-Track CDM CSV and do initial cleaning."""
    df = pd.read_csv(csv_path)

    # Rename columns to unified format
    df = df.rename(columns=SPACETRACK_COLUMN_MAP)

    # Parse dates
    for col in ["created", "tca"]:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")

    # Convert miss_distance to float
    if "miss_distance" in df.columns:
        df["miss_distance"] = pd.to_numeric(df["miss_distance"], errors="coerce")
        # Space-Track MIN_RNG is in km; ESA Kelvins miss_distance is in meters
        # Convert to meters for consistency
        df["miss_distance"] = df["miss_distance"] * 1000.0

    # Convert collision_probability to float
    if "collision_probability" in df.columns:
        df["collision_probability"] = pd.to_numeric(df["collision_probability"], errors="coerce")

    # Derive risk column (log10 of PC, matching ESA format)
    if "collision_probability" in df.columns:
        df["risk"] = np.where(
            df["collision_probability"] > 0,
            np.log10(df["collision_probability"].clip(lower=1e-30)),
            -30.0,
        )

    print(f"Loaded {len(df)} Space-Track CDMs from {csv_path.name}")
    return df


def group_into_events(df: pd.DataFrame) -> pd.DataFrame:
    """

    Group Space-Track CDMs into conjunction events.



    An 'event' is a sequence of CDMs for the same object pair with TCA

    values within 1 day of each other. Each event gets a unique event_id.

    """
    if df.empty:
        return df

    # Sort by object pair and TCA
    df = df.sort_values(["sat_1_id", "sat_2_id", "tca", "created"]).reset_index(drop=True)

    # Assign event IDs: same pair + TCA within 1 day = same event
    event_ids = []
    current_event = 0
    prev_sat1 = None
    prev_sat2 = None
    prev_tca = None

    for _, row in df.iterrows():
        sat1 = row.get("sat_1_id")
        sat2 = row.get("sat_2_id")
        tca = row.get("tca")

        same_pair = (sat1 == prev_sat1 and sat2 == prev_sat2)
        close_tca = False
        if same_pair and prev_tca is not None and pd.notna(tca) and pd.notna(prev_tca):
            close_tca = abs((tca - prev_tca).total_seconds()) < 86400  # 1 day

        if not (same_pair and close_tca):
            current_event += 1

        event_ids.append(current_event)
        prev_sat1 = sat1
        prev_sat2 = sat2
        prev_tca = tca

    df["event_id"] = event_ids

    # Compute time_to_tca: days from CDM creation to TCA (for each CDM in event)
    if "created" in df.columns and "tca" in df.columns:
        df["time_to_tca"] = (df["tca"] - df["created"]).dt.total_seconds() / 86400.0
        df["time_to_tca"] = df["time_to_tca"].clip(lower=0.0)

    n_events = df["event_id"].nunique()
    n_high_risk = 0
    if "risk" in df.columns:
        event_risks = df.groupby("event_id")["risk"].last()
        n_high_risk = (event_risks > -5).sum()

    print(f"Grouped into {n_events} events ({n_high_risk} high-risk)")
    return df


def compute_relative_speed_from_excl_vol(df: pd.DataFrame) -> pd.DataFrame:
    """Estimate relative speed from exclusion volumes if available."""
    # excl_vol is in km, but we can't derive speed from it alone
    # Just ensure the column exists for compatibility
    if "relative_speed" not in df.columns:
        df["relative_speed"] = 0.0
    return df


def align_with_kelvins_schema(

    spacetrack_df: pd.DataFrame,

    kelvins_df: pd.DataFrame,

) -> pd.DataFrame:
    """

    Align Space-Track data columns with Kelvins schema.

    Missing columns get filled with 0.

    """
    # Get all columns from Kelvins
    kelvins_cols = set(kelvins_df.columns)
    st_cols = set(spacetrack_df.columns)

    # Add missing numeric columns as 0
    for col in kelvins_cols:
        if col not in st_cols:
            spacetrack_df[col] = 0.0

    # Keep only columns that exist in Kelvins + our extra metadata
    extra_cols = {"sat_1_id", "sat_2_id", "sat_1_name", "sat_2_name",
                  "t_object_type", "collision_probability", "created", "tca",
                  "cdm_id", "emergency_reportable", "t_rcs", "c_rcs",
                  "t_excl_vol", "c_excl_vol", "source"}
    keep_cols = list(kelvins_cols | extra_cols)
    available = [c for c in keep_cols if c in spacetrack_df.columns]
    return spacetrack_df[available]


def merge_datasets(

    kelvins_train_df: pd.DataFrame,

    spacetrack_df: pd.DataFrame,

    offset_event_ids: bool = True,

) -> pd.DataFrame:
    """

    Merge Kelvins training data with Space-Track CDMs.



    Args:

        kelvins_train_df: ESA Kelvins training DataFrame

        spacetrack_df: Space-Track CDMs (already grouped into events)

        offset_event_ids: shift Space-Track event_ids to avoid collisions



    Returns:

        Combined DataFrame ready for model training

    """
    # Tag sources
    kelvins_train_df = kelvins_train_df.copy()
    kelvins_train_df["source"] = "kelvins"

    spacetrack_df = spacetrack_df.copy()
    spacetrack_df["source"] = "spacetrack"

    # Offset Space-Track event IDs to avoid collision with Kelvins IDs
    if offset_event_ids and "event_id" in kelvins_train_df.columns:
        max_kelvins_id = kelvins_train_df["event_id"].max()
        spacetrack_df["event_id"] = spacetrack_df["event_id"] + max_kelvins_id + 1

    # Align columns
    spacetrack_df = align_with_kelvins_schema(spacetrack_df, kelvins_train_df)

    # Concatenate
    combined = pd.concat([kelvins_train_df, spacetrack_df], ignore_index=True)

    # Fill any remaining NaN
    numeric_cols = combined.select_dtypes(include=[np.number]).columns
    combined[numeric_cols] = combined[numeric_cols].fillna(0)

    n_kelvins = kelvins_train_df["event_id"].nunique()
    n_st = spacetrack_df["event_id"].nunique()
    n_total = combined["event_id"].nunique()

    # Count high-risk events per source
    event_risk = combined.groupby(["event_id", "source"])["risk"].last().reset_index()
    n_hr_kelvins = ((event_risk["source"] == "kelvins") & (event_risk["risk"] > -5)).sum()
    n_hr_st = ((event_risk["source"] == "spacetrack") & (event_risk["risk"] > -5)).sum()

    print(f"\nMerged dataset:")
    print(f"  Kelvins:     {n_kelvins} events ({n_hr_kelvins} high-risk)")
    print(f"  Space-Track: {n_st} events ({n_hr_st} high-risk)")
    print(f"  Total:       {n_total} events ({n_hr_kelvins + n_hr_st} high-risk)")
    print(f"  Columns:     {len(combined.columns)}")

    return combined


def load_and_merge_all(data_dir: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """

    Load all available data sources and merge into train/test DataFrames.



    Returns (train_df, test_df) — test is Kelvins-only (for fair comparison).

    """
    from src.data.cdm_loader import load_dataset

    # Load ESA Kelvins
    kelvins_dir = data_dir / "cdm"
    kelvins_train, kelvins_test = load_dataset(kelvins_dir)

    # Load Space-Track data if available
    spacetrack_dir = data_dir / "cdm_spacetrack"
    spacetrack_files = list(spacetrack_dir.glob("cdm_*.csv")) if spacetrack_dir.exists() else []

    if not spacetrack_files:
        print("\nNo Space-Track data found. Using Kelvins only.")
        return kelvins_train, kelvins_test

    # Load and merge all Space-Track CSVs
    st_dfs = []
    for f in spacetrack_files:
        if f.name.startswith("checkpoint"):
            continue
        df = load_spacetrack_cdms(f)
        df = group_into_events(df)
        df = compute_relative_speed_from_excl_vol(df)
        st_dfs.append(df)

    if st_dfs:
        all_st = pd.concat(st_dfs, ignore_index=True)
        # Re-assign event IDs after concatenation
        all_st = group_into_events(all_st)
        merged_train = merge_datasets(kelvins_train, all_st)
    else:
        merged_train = kelvins_train

    # Test set stays Kelvins-only for fair benchmarking
    return merged_train, kelvins_test