"""Physics-inspired feature engineering and selection for Kepler KOI data.""" import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import StandardScaler RANDOM_STATE = 42 TOP_K_FEATURES = 20 def _safe_divide(a: pd.Series, b: pd.Series) -> pd.Series: """Divide with zero handling.""" return np.where(b != 0, a / b, 0) def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame: """Add physics-inspired features. Returns df with new columns.""" df = df.copy() if "koi_period" in df.columns: df["period_squared"] = df["koi_period"] ** 2 if "koi_model_snr" in df.columns and "koi_depth" in df.columns: df["snr_depth_product"] = df["koi_model_snr"] * df["koi_depth"] if "koi_depth" in df.columns and "koi_srad" in df.columns: df["relative_depth"] = _safe_divide(df["koi_depth"], df["koi_srad"]) if "koi_slogg" in df.columns and "koi_srad" in df.columns: df["stellar_density_proxy"] = _safe_divide(df["koi_slogg"], df["koi_srad"]) if "koi_period_err1" in df.columns and "koi_period_err2" in df.columns: df["period_uncertainty"] = np.abs(df["koi_period_err1"] - df["koi_period_err2"]) return df def select_top_features( X_train: pd.DataFrame, y_train: pd.Series, k: int = TOP_K_FEATURES ) -> list[str]: """Use Random Forest feature importance to select top k features.""" rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE) rf.fit(X_train, y_train) importance = pd.Series(rf.feature_importances_, index=X_train.columns) return importance.nlargest(k).index.tolist() def build_feature_pipeline(X_train: pd.DataFrame, y_train: pd.Series) -> tuple: """ Build preprocessing pipeline: add features, select top k, scale. Returns (feature_names, scaler, selected_columns). """ X = add_engineered_features(X_train) selected = select_top_features(X, y_train, k=TOP_K_FEATURES) X_selected = X[selected] scaler = StandardScaler() scaler.fit(X_selected) return selected, scaler