"""Data loading and preprocessing for Kepler KOI exoplanet detection.""" from pathlib import Path import numpy as np import pandas as pd from imblearn.over_sampling import SMOTE from sklearn.model_selection import train_test_split RANDOM_STATE = 42 # Identifier columns to drop (non-numeric, not useful for classification) ID_COLUMNS = [ "rowid", "kepid", "kepoi_name", "kepler_name", "koi_tce_plnt_num", "koi_tce_delivname", "ra", "dec", ] def load_and_preprocess(data_path: Path) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: """ Load cumulative.csv, filter CONFIRMED/FALSE POSITIVE, preprocess, split. Returns (X_train, y_train, X_test, y_test). """ df = pd.read_csv(data_path) # Filter target classes valid = df["koi_disposition"].isin(["CONFIRMED", "FALSE POSITIVE"]) df = df[valid].copy() # Encode labels: CONFIRMED=1, FALSE POSITIVE=0 y = (df["koi_disposition"] == "CONFIRMED").astype(int) df = df.drop(columns=["koi_disposition"]) # Drop identifier columns (keep only if present) to_drop = [c for c in ID_COLUMNS if c in df.columns] df = df.drop(columns=to_drop, errors="ignore") # Drop non-numeric columns numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() df = df[numeric_cols].copy() # Handle missing values: median imputation, then 0 for any remaining df = df.fillna(df.median()) df = df.fillna(0) df = df.replace([np.inf, -np.inf], 0) # Stratified 80/20 split X_train, X_test, y_train, y_test = train_test_split( df, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE ) # SMOTE on training data only smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=5) X_train, y_train = smote.fit_resample(X_train, y_train) return X_train, y_train, X_test, y_test