File size: 2,107 Bytes
978fed5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""Physics-inspired feature engineering and selection for Kepler KOI data."""

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Seed passed as random_state to the RandomForestClassifier used for feature ranking.
RANDOM_STATE = 42
# Default number of top-ranked features kept by the selection step.
TOP_K_FEATURES = 20


def _safe_divide(a: pd.Series, b: pd.Series) -> pd.Series:
    """Divide ``a`` by ``b`` element-wise, yielding 0 wherever ``b`` is zero.

    The result is a ``pd.Series`` that keeps the operands' index, matching
    the annotated return type (a bare ``np.where`` would hand back an
    index-less ndarray instead).

    Args:
        a: Numerator values.
        b: Denominator values.

    Returns:
        The quotient ``a / b``, with every position where ``b == 0``
        replaced by 0. A NaN denominator is not equal to zero, so NaN
        propagates into the result at that position.
    """
    quotient = a / b
    # Overwrite the inf/NaN produced by zero denominators with the sentinel 0.
    return quotient.where(b != 0, 0)


def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with derived feature columns.

    The input frame is never modified. Each derived column is added only
    when all of its source columns exist:

    * ``period_squared``: ``koi_period`` squared.
    * ``snr_depth_product``: ``koi_model_snr`` times ``koi_depth``.
    * ``relative_depth``: ``koi_depth`` divided by ``koi_srad``.
    * ``stellar_density_proxy``: ``koi_slogg`` divided by ``koi_srad``.
    * ``period_uncertainty``: absolute difference between
      ``koi_period_err1`` and ``koi_period_err2``.

    Both ratios go through ``_safe_divide``, so a zero ``koi_srad`` gives 0.
    """
    out = df.copy()

    def _has(*names: str) -> bool:
        # True when every named column is currently present in the copy.
        return all(name in out.columns for name in names)

    if _has("koi_period"):
        period = out["koi_period"]
        out["period_squared"] = period ** 2

    if _has("koi_model_snr", "koi_depth"):
        snr = out["koi_model_snr"]
        out["snr_depth_product"] = snr * out["koi_depth"]

    if _has("koi_depth", "koi_srad"):
        out["relative_depth"] = _safe_divide(out["koi_depth"], out["koi_srad"])

    if _has("koi_slogg", "koi_srad"):
        out["stellar_density_proxy"] = _safe_divide(
            out["koi_slogg"], out["koi_srad"]
        )

    if _has("koi_period_err1", "koi_period_err2"):
        err_gap = out["koi_period_err1"] - out["koi_period_err2"]
        out["period_uncertainty"] = np.abs(err_gap)

    return out


def select_top_features(
    X_train: pd.DataFrame, y_train: pd.Series, k: int = TOP_K_FEATURES
) -> list[str]:
    """Rank columns by random-forest importance and return the best ``k``.

    A 100-tree ``RandomForestClassifier`` seeded with ``RANDOM_STATE`` is
    fitted on ``X_train`` / ``y_train``; the returned list holds the names
    of the ``k`` columns with the largest ``feature_importances_``, most
    important first.
    """
    forest = RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE
    ).fit(X_train, y_train)
    # Label each importance score with its column name so ranking yields names.
    scores = pd.Series(forest.feature_importances_, index=X_train.columns)
    best = scores.nlargest(k)
    return best.index.tolist()


def build_feature_pipeline(
    X_train: pd.DataFrame, y_train: pd.Series, k: "int | None" = None
) -> tuple:
    """Engineer features, pick the most important ones, and fit a scaler.

    Steps: add the engineered columns to a copy of ``X_train``, rank all
    columns with ``select_top_features``, then fit a ``StandardScaler`` on
    the selected columns only.

    Args:
        X_train: Training feature frame (left unmodified).
        y_train: Training labels used for the importance ranking.
        k: Number of features to keep. ``None`` (the default) means
            ``TOP_K_FEATURES``, which was previously hard-coded.

    Returns:
        A 2-tuple ``(selected, scaler)``: the list of selected column names
        and the ``StandardScaler`` fitted on those columns of the
        feature-engineered training frame.
    """
    if k is None:
        k = TOP_K_FEATURES
    X = add_engineered_features(X_train)
    selected = select_top_features(X, y_train, k=k)
    scaler = StandardScaler()
    # Fit on the selected columns only, so the scaler matches the frame
    # callers will build from `selected`.
    scaler.fit(X[selected])
    return selected, scaler