# Uploaded by leonardklin ("Upload 328 files", revision 978fed5, verified).
"""Data loading and preprocessing for Kepler KOI exoplanet detection."""
from pathlib import Path
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# Single seed reused for the train/test split and for SMOTE so that runs are
# reproducible.
RANDOM_STATE: int = 42

# Columns removed from the feature table before modelling, when present.
# Judging by their names these are catalogue identifiers/names, TCE
# bookkeeping fields and sky coordinates; they are dropped explicitly rather
# than relying on the later numeric-only filter.
ID_COLUMNS: list[str] = [
    "rowid",
    "kepid",
    "kepoi_name",
    "kepler_name",
    "koi_tce_plnt_num",
    "koi_tce_delivname",
    "ra",
    "dec",
]
def load_and_preprocess(data_path: Path) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Load the KOI table and build a stratified, imputed, SMOTE-balanced split.

    Steps:
      1. Read the CSV and keep only rows whose ``koi_disposition`` is
         ``CONFIRMED`` or ``FALSE POSITIVE``.
      2. Encode the label (CONFIRMED=1, FALSE POSITIVE=0).
      3. Drop the columns listed in ``ID_COLUMNS`` (those present) and every
         non-numeric column.
      4. Treat +/-inf as missing values.
      5. Make a stratified 80/20 train/test split.
      6. Impute missing values in both partitions with the *training* medians
         (0 for columns that have no observed training value).
      7. Oversample the minority class with SMOTE on the training data only.

    Args:
        data_path: Path to the KOI CSV file (e.g. ``cumulative.csv``).

    Returns:
        ``(X_train, y_train, X_test, y_test)``. The training pair has been
        resampled by SMOTE; the test pair is left untouched apart from
        imputation.

    Raises:
        KeyError: If the CSV has no ``koi_disposition`` column.
    """
    df = pd.read_csv(data_path)

    # Keep only rows with one of the two target dispositions.
    valid = df["koi_disposition"].isin(["CONFIRMED", "FALSE POSITIVE"])
    df = df[valid].copy()

    # Encode labels: CONFIRMED=1, FALSE POSITIVE=0
    y = (df["koi_disposition"] == "CONFIRMED").astype(int)
    df = df.drop(columns=["koi_disposition"])

    # Drop identifier columns; the list is pre-filtered so a CSV that lacks
    # some of them does not raise.
    df = df.drop(columns=[c for c in ID_COLUMNS if c in df.columns])

    # Keep numeric feature columns only.
    df = df.select_dtypes(include=[np.number]).copy()

    # Treat infinities as missing *before* any statistics are computed, so
    # they neither distort the medians nor get a different fill value.
    df = df.replace([np.inf, -np.inf], np.nan)

    # Stratified 80/20 split. This happens before imputation so that no
    # test-set statistics leak into the training features.
    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    # Median imputation fitted on the training partition only and applied to
    # both partitions; columns whose training median is NaN (no observed
    # values) fall back to 0.
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians).fillna(0)
    X_test = X_test.fillna(train_medians).fillna(0)

    # SMOTE on training data only; the test set keeps its original class
    # balance.
    smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=5)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    return X_train, y_train, X_test, y_test