| import os |
| import requests |
| import pandas as pd |
| from sklearn.model_selection import train_test_split |
|
|
# Resolve the project layout relative to this script: BASE_DIR is the parent
# of the directory that contains this file.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_SCRIPT_DIR)

# Download cache and curated dataset output folders, both directly under BASE_DIR.
DATA_CACHE_DIR = os.path.join(BASE_DIR, "data_cache")
DATASETS_DIR = os.path.join(BASE_DIR, "datasets")
|
|
| |
# Names of the sub-directories to create under DATA_CACHE_DIR.
subdirs = [
    "kepler", "tess", "koi", "benchmarks",
    "training", "models", "reports", "mcmc",
]

print("Initializing Phase 1: Local Data Lake...")
for cache_path in (os.path.join(DATA_CACHE_DIR, name) for name in subdirs):
    # exist_ok=True makes re-runs safe; note the message is printed even when
    # the directory was already present.
    os.makedirs(cache_path, exist_ok=True)
    print(f" Created: {cache_path}")

# Output folder for the curated CSV files written later in the script.
os.makedirs(DATASETS_DIR, exist_ok=True)
|
|
| |
print("\nInitializing Phase 2: Dataset Curation Pipeline...")

# Query against the NASA Exoplanet Archive API: selected columns of the
# "cumulative" KOI table, returned as CSV.
KOI_API_URL = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=cumulative&select=kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_period,koi_depth,koi_duration,koi_prad,koi_sma,koi_teq,koi_model_snr&format=csv"
RAW_KOI_PATH = os.path.join(DATA_CACHE_DIR, "koi", "cumulative_raw.csv")

# Seconds to wait for the archive before giving up; without a timeout
# requests can block indefinitely on a stalled connection.
KOI_REQUEST_TIMEOUT_S = 60

if not os.path.exists(RAW_KOI_PATH):
    print(" Fetching Kepler KOI cumulative table from NASA Exoplanet Archive (~5MB)...")
    res = requests.get(KOI_API_URL, timeout=KOI_REQUEST_TIMEOUT_S)
    # Fail loudly on HTTP errors. Otherwise an error page would be written to
    # the cache and, because of the exists() check above, never re-downloaded.
    res.raise_for_status()

    os.makedirs(os.path.dirname(RAW_KOI_PATH), exist_ok=True)
    # Write to a temporary file and rename it into place so that an interrupted
    # run cannot leave a truncated file that later passes for a valid cache.
    partial_path = RAW_KOI_PATH + ".part"
    with open(partial_path, "wb") as f:
        f.write(res.content)
    os.replace(partial_path, RAW_KOI_PATH)
    print(" Download complete.")
else:
    print(" KOI cumulative table already exists in cache.")
|
|
| |
# Load the cached KOI table.
df = pd.read_csv(RAW_KOI_PATH)
print(f" Total KOIs loaded: {len(df)}")

# Keep only rows that have all three transit parameters present.
df = df.dropna(subset=['koi_period', 'koi_depth', 'koi_duration'])

# Partition the remaining rows by their archive disposition.
confirmed = df[df['koi_disposition'] == 'CONFIRMED']
false_pos = df[df['koi_disposition'] == 'FALSE POSITIVE']
candidates = df[df['koi_disposition'] == 'CANDIDATE']

confirmed.to_csv(os.path.join(DATASETS_DIR, "confirmed_planets.csv"), index=False)
false_pos.to_csv(os.path.join(DATASETS_DIR, "false_positives.csv"), index=False)
# The summary below reports candidates as saved, so persist them as well;
# previously this frame was counted but never written to disk.
candidates.to_csv(os.path.join(DATASETS_DIR, "candidates.csv"), index=False)

print(f" Saved {len(confirmed)} Confirmed Planets")
print(f" Saved {len(false_pos)} False Positives")
print(f" Saved {len(candidates)} Candidates")
|
|
| |
| |
# Binary labels: confirmed planets are the positive class (1) and false
# positives the negative class (0). assign() returns a new frame, so the
# source frames are left untouched.
confirmed_labeled = confirmed.assign(label=1)
false_pos_labeled = false_pos.assign(label=0)

# Stack both classes, shuffle the rows with a fixed seed, and renumber the index.
stacked = pd.concat([confirmed_labeled, false_pos_labeled])
shuffled = stacked.sample(frac=1, random_state=42)
combined = shuffled.reset_index(drop=True)

# 80% train; the remaining 20% is halved into validation and test (10% each).
# Both cuts are stratified on the label column.
train_df, temp_df = train_test_split(
    combined,
    test_size=0.2,
    random_state=42,
    stratify=combined["label"],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["label"],
)

# Write each split to its own CSV in the datasets folder.
for split_filename, split_frame in (
    ("train_split.csv", train_df),
    ("validation_split.csv", val_df),
    ("test_split.csv", test_df),
):
    split_frame.to_csv(os.path.join(DATASETS_DIR, split_filename), index=False)

print(f" Created Splits: Train({len(train_df)}), Val({len(val_df)}), Test({len(test_df)})")
print("\nPhase 1 and 2 Initialization Complete!")
|
|