File size: 1,869 Bytes
978fed5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""Data loading and preprocessing for Kepler KOI exoplanet detection."""

from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Seed passed to both the train/test split and SMOTE in load_and_preprocess.
RANDOM_STATE = 42

# Columns removed from the feature table before modelling (each one is
# dropped only if it is actually present in the loaded CSV).
ID_COLUMNS = [
    # Row / catalogue identifiers and names
    "rowid", "kepid", "kepoi_name", "kepler_name",
    # TCE bookkeeping fields
    "koi_tce_plnt_num", "koi_tce_delivname",
    # NOTE(review): presumably sky coordinates — confirm against the dataset
    "ra", "dec",
]


def _impute_features(features: pd.DataFrame, medians: pd.Series) -> pd.DataFrame:
    """Fill NaNs with the supplied per-column medians, then zero what is left."""
    filled = features.fillna(medians)
    # A column whose median is itself NaN (no usable values) falls back to 0.
    filled = filled.fillna(0)
    # Infinite values are replaced with 0, after the NaN handling.
    return filled.replace([np.inf, -np.inf], 0)


def load_and_preprocess(data_path: Path) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """
    Load the KOI table, keep CONFIRMED/FALSE POSITIVE rows, preprocess, split.

    Steps, in order:
      1. Read the CSV at ``data_path``.
      2. Keep only rows whose ``koi_disposition`` is CONFIRMED or
         FALSE POSITIVE and encode the label as 1 / 0 respectively.
      3. Drop the columns listed in ``ID_COLUMNS`` (when present) and every
         non-numeric column.
      4. Make a stratified 80/20 train/test split.
      5. Impute missing values using medians computed on the training split
         only, so no statistic of the test rows leaks into training.
      6. Oversample the training split with SMOTE; the test split is left
         untouched.

    Args:
        data_path: Location of the CSV file to load.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.

    Raises:
        KeyError: If the CSV has no ``koi_disposition`` column.
    """
    df = pd.read_csv(data_path)

    # Keep only the two target classes.
    wanted = df["koi_disposition"].isin(["CONFIRMED", "FALSE POSITIVE"])
    df = df[wanted].copy()

    # Encode labels: CONFIRMED=1, FALSE POSITIVE=0.
    y = (df["koi_disposition"] == "CONFIRMED").astype(int)
    df = df.drop(columns=["koi_disposition"])

    # Drop identifier columns; errors="ignore" skips any that are absent.
    df = df.drop(columns=ID_COLUMNS, errors="ignore")

    # Keep numeric features only.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    df = df[numeric_cols].copy()

    # Stratified 80/20 split. This happens BEFORE imputation so that the
    # imputation statistics below never see the held-out rows.
    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    # Medians come from the training split and are applied to both splits.
    train_medians = X_train.median()
    X_train = _impute_features(X_train, train_medians)
    X_test = _impute_features(X_test, train_medians)

    # SMOTE on training data only.
    smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=5)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    return X_train, y_train, X_test, y_test