EXONYX / scripts /setup_v5_env.py
Aditya-Jadhav150
Deploy clean EXONYX Backend
8f0e1cb
Raw
History Blame Contribute Delete
3.23 kB
import os
import requests
import pandas as pd
from sklearn.model_selection import train_test_split
# Resolve the project root as the directory that contains this script's folder,
# then derive the two working directories used by the rest of the script.
_THIS_FILE = os.path.abspath(__file__)
_SCRIPT_DIR = os.path.dirname(_THIS_FILE)
BASE_DIR = os.path.dirname(_SCRIPT_DIR)

DATA_CACHE_DIR = os.path.join(BASE_DIR, "data_cache")  # root for the per-topic cache sub-folders
DATASETS_DIR = os.path.join(BASE_DIR, "datasets")      # destination for the exported CSV files
# Phase 1: Local Data Lake Structure
# Every name below becomes a sub-folder of DATA_CACHE_DIR, created in this order.
subdirs = [
    "kepler",
    "tess",
    "koi",
    "benchmarks",
    "training",
    "models",
    "reports",
    "mcmc",
]

print("Initializing Phase 1: Local Data Lake...")
for cache_path in (os.path.join(DATA_CACHE_DIR, folder) for folder in subdirs):
    # exist_ok=True keeps re-runs of the script from failing on existing folders.
    os.makedirs(cache_path, exist_ok=True)
    print(f" Created: {cache_path}")

# The dataset export directory sits next to the cache, not inside it.
os.makedirs(DATASETS_DIR, exist_ok=True)
# Phase 2: Dataset Curation Pipeline
print("\nInitializing Phase 2: Dataset Curation Pipeline...")
KOI_API_URL = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=cumulative&select=kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_period,koi_depth,koi_duration,koi_prad,koi_sma,koi_teq,koi_model_snr&format=csv"
RAW_KOI_PATH = os.path.join(DATA_CACHE_DIR, "koi", "cumulative_raw.csv")
# (connect, read) timeouts in seconds. Without a timeout requests waits
# indefinitely on an unresponsive server.
KOI_REQUEST_TIMEOUT = (10, 120)

# The download is skipped whenever the cache file exists, so the file must only
# ever appear at RAW_KOI_PATH once it is complete and came from a successful
# response; otherwise every later run would silently reuse a bad cache.
if not os.path.exists(RAW_KOI_PATH):
    print(" Fetching Kepler KOI cumulative table from NASA Exoplanet Archive (~5MB)...")
    os.makedirs(os.path.dirname(RAW_KOI_PATH), exist_ok=True)
    res = requests.get(KOI_API_URL, timeout=KOI_REQUEST_TIMEOUT)
    # Abort on 4xx/5xx instead of caching the error page as if it were CSV data.
    res.raise_for_status()
    # Write to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated file at the cache path.
    partial_path = RAW_KOI_PATH + ".part"
    with open(partial_path, "wb") as f:
        f.write(res.content)
    os.replace(partial_path, RAW_KOI_PATH)
    print(" Download complete.")
else:
    print(" KOI cumulative table already exists in cache.")
# Process into confirmed planets and false positives
df = pd.read_csv(RAW_KOI_PATH)
print(f" Total KOIs loaded: {len(df)}")

# Fail early with an actionable message if the cached file does not look like
# the KOI table (e.g. a bad download), rather than surfacing a bare KeyError
# from the filtering steps below.
required_columns = ['koi_period', 'koi_depth', 'koi_duration', 'koi_disposition']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{RAW_KOI_PATH} is missing expected columns {missing_columns}; "
        "the cached download may be corrupt - delete it and re-run."
    )

# Filter out targets with a null period, depth or duration, as they are required for TLS simulation
df = df.dropna(subset=['koi_period', 'koi_depth', 'koi_duration'])

# Partition the remaining rows by archive disposition.
confirmed = df[df['koi_disposition'] == 'CONFIRMED']
false_pos = df[df['koi_disposition'] == 'FALSE POSITIVE']
candidates = df[df['koi_disposition'] == 'CANDIDATE']

# Only the two labelled classes are exported; candidates are counted for the
# summary below but no file is written for them.
confirmed.to_csv(os.path.join(DATASETS_DIR, "confirmed_planets.csv"), index=False)
false_pos.to_csv(os.path.join(DATASETS_DIR, "false_positives.csv"), index=False)
print(f" Saved {len(confirmed)} Confirmed Planets")
print(f" Saved {len(false_pos)} False Positives")
# Previously reported as "Saved", which was untrue: nothing is written for candidates.
print(f" Found {len(candidates)} Candidates (not written to disk)")
# Build the labelled Train / Val / Test CSVs (80 / 10 / 10).
# Label 1 = CONFIRMED, Label 0 = FALSE POSITIVE
confirmed_labeled = confirmed.assign(label=1)
false_pos_labeled = false_pos.assign(label=0)

# All false positives are kept (no class re-balancing to 2:1 or 1:1 for now).
# The frame is shuffled with a fixed seed before splitting so runs are repeatable.
labeled_frames = [confirmed_labeled, false_pos_labeled]
combined = pd.concat(labeled_frames)
combined = combined.sample(frac=1, random_state=42)
combined = combined.reset_index(drop=True)

# First carve off 20% as a holdout, then halve the holdout into validation and
# test; both steps stratify on the label column.
train_df, temp_df = train_test_split(
    combined,
    test_size=0.2,
    random_state=42,
    stratify=combined['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

# Write the three splits in train -> validation -> test order.
split_outputs = (
    ("train_split.csv", train_df),
    ("validation_split.csv", val_df),
    ("test_split.csv", test_df),
)
for split_filename, split_frame in split_outputs:
    split_frame.to_csv(os.path.join(DATASETS_DIR, split_filename), index=False)

print(f" Created Splits: Train({len(train_df)}), Val({len(val_df)}), Test({len(test_df)})")
print("\nPhase 1 and 2 Initialization Complete!")