# leonardklin's picture
# Upload 328 files
# 978fed5 verified
"""Physics-inspired feature engineering and selection for Kepler KOI data."""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
# Seed passed as ``random_state`` to the random forest in select_top_features.
RANDOM_STATE = 42
# Default number of features kept by select_top_features and
# build_feature_pipeline.
TOP_K_FEATURES = 20
def _safe_divide(a: pd.Series, b: pd.Series) -> pd.Series:
    """Divide ``a`` by ``b`` element-wise, yielding 0 where ``b`` is zero.

    Args:
        a: Numerator values.
        b: Denominator values.

    Returns:
        A Series holding ``a / b`` wherever ``b != 0`` and ``0`` elsewhere.
        A missing (NaN) denominator compares unequal to zero, so those
        positions stay NaN.
    """
    nonzero = b != 0
    # Mask zero denominators before dividing so no infinities are produced,
    # and stay in pandas (rather than np.where) so the result keeps its index
    # and really is the Series the signature promises.
    return (a / b.where(nonzero)).where(nonzero, 0.0)
def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with derived KOI columns.

    A derived column is only created when every source column it needs is
    present; the input frame itself is never modified.

    Derived columns:
        period_squared: ``koi_period`` squared.
        snr_depth_product: ``koi_model_snr`` times ``koi_depth``.
        relative_depth: ``koi_depth`` over ``koi_srad`` (zero-safe division).
        stellar_density_proxy: ``koi_slogg`` over ``koi_srad`` (zero-safe
            division).
        period_uncertainty: ``|koi_period_err1 - koi_period_err2|``.
    """
    enriched = df.copy()

    def available(*names: str) -> bool:
        # True only when every requested source column exists.
        return all(name in enriched.columns for name in names)

    if available("koi_period"):
        enriched["period_squared"] = enriched["koi_period"].pow(2)

    if available("koi_model_snr", "koi_depth"):
        snr, depth = enriched["koi_model_snr"], enriched["koi_depth"]
        enriched["snr_depth_product"] = snr * depth

    if available("koi_depth", "koi_srad"):
        enriched["relative_depth"] = _safe_divide(
            enriched["koi_depth"], enriched["koi_srad"]
        )

    if available("koi_slogg", "koi_srad"):
        enriched["stellar_density_proxy"] = _safe_divide(
            enriched["koi_slogg"], enriched["koi_srad"]
        )

    if available("koi_period_err1", "koi_period_err2"):
        spread = enriched["koi_period_err1"] - enriched["koi_period_err2"]
        enriched["period_uncertainty"] = spread.abs()

    return enriched
def select_top_features(
    X_train: pd.DataFrame, y_train: pd.Series, k: int = TOP_K_FEATURES
) -> list[str]:
    """Rank columns by random-forest importance and return the best ``k``.

    Args:
        X_train: Training feature matrix; its column labels are the
            candidate feature names.
        y_train: Training labels used to fit the forest.
        k: How many feature names to return.

    Returns:
        The column labels with the ``k`` largest importances, most important
        first.
    """
    forest = RandomForestClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,  # fixed seed keeps the ranking reproducible
    )
    forest.fit(X_train, y_train)

    scores = pd.Series(forest.feature_importances_, index=X_train.columns)
    best = scores.nlargest(k)
    return best.index.tolist()
def build_feature_pipeline(X_train: pd.DataFrame, y_train: pd.Series) -> tuple:
    """Fit the preprocessing steps: add features, select top k, scale.

    Args:
        X_train: Raw training features, before feature engineering.
        y_train: Training labels used to rank feature importance.

    Returns:
        A 2-tuple ``(selected, scaler)``:

        * ``selected`` -- names of the ``TOP_K_FEATURES`` highest-ranked
          columns, chosen from the engineered feature set.
        * ``scaler`` -- a ``StandardScaler`` already fitted on exactly those
          columns of the engineered training frame.
    """
    engineered = add_engineered_features(X_train)
    selected = select_top_features(engineered, y_train, k=TOP_K_FEATURES)

    # Fit the scaler only on the retained columns, in selection order, so it
    # can later be applied to ``frame[selected]``.
    scaler = StandardScaler()
    scaler.fit(engineered[selected])
    return selected, scaler