V7: PS data quality fix + school pctile + ablation
Browse files- train_v38_2_pro_v7.py +1131 -0
train_v38_2_pro_v7.py
ADDED
|
@@ -0,0 +1,1131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
====================================================================
|
| 3 |
+
V38.2-PRO-V7 MODEL - PS Data Quality Fix + School Pctile + Ablation
|
| 4 |
+
====================================================================
|
| 5 |
+
Changes from V38.2-PRO-V6:
|
| 6 |
+
1. FIX #6: has_ps=0 -> ALL ps2_* scores NaN (5057 rows were polluted)
|
| 7 |
+
2. FIX #7: Residualization school_mean for PS features uses ONLY has_ps=1 rows
|
| 8 |
+
3. NEW: ps2_mean_school_pctile (continuous within-school percentile, solves granularity)
|
| 9 |
+
4. REMOVE: ps2_is_cliche_topic (53.5% prevalence, no signal)
|
| 10 |
+
5. ABLATION: ABLATE_PS_BERT flag to test removing ps_bert_pca 16 dims
|
| 11 |
+
6. All V6 fixes carried forward
|
| 12 |
+
====================================================================
|
| 13 |
+
"""
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import numpy as np
|
| 16 |
+
import json, os, warnings, sys, time, pickle, gc
|
| 17 |
+
warnings.filterwarnings('ignore')
|
| 18 |
+
from sklearn.model_selection import GroupKFold
|
| 19 |
+
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss
|
| 20 |
+
from sklearn.preprocessing import LabelEncoder
|
| 21 |
+
from scipy.stats import rankdata
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
from catboost import CatBoostClassifier, Pool
|
| 25 |
+
import lightgbm as lgb
|
| 26 |
+
import xgboost as xgb
|
| 27 |
+
print("All model libraries loaded successfully")
|
| 28 |
+
except ImportError as e:
|
| 29 |
+
print(f"Missing library: {e}")
|
| 30 |
+
import subprocess
|
| 31 |
+
subprocess.check_call([sys.executable, '-m', 'pip', 'install',
|
| 32 |
+
'catboost', 'lightgbm', 'xgboost', '-q'])
|
| 33 |
+
from catboost import CatBoostClassifier, Pool
|
| 34 |
+
import lightgbm as lgb
|
| 35 |
+
import xgboost as xgb
|
| 36 |
+
|
| 37 |
+
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 38 |
+
DATA_DIR = os.path.join(BASE_DIR, 'data')
|
| 39 |
+
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
|
| 40 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 41 |
+
|
| 42 |
+
TARGET = 'target'
|
| 43 |
+
SEEDS = [42, 123, 456, 789, 2024]
|
| 44 |
+
N_FOLDS = 10
|
| 45 |
+
FEATURE_SELECT_TOP_N = 150
|
| 46 |
+
start_time = time.time()
|
| 47 |
+
|
| 48 |
+
# ============================================================
|
| 49 |
+
# ABLATION FLAGS - set to True to remove feature groups
|
| 50 |
+
# ============================================================
|
| 51 |
+
ABLATE_PS_BERT = False # Set True to remove ps_bert_pca_0..15 (16 dims)
|
| 52 |
+
|
| 53 |
+
def safe_num(v, default=np.nan):
    """Coerce *v* to float, mapping the legacy -1 sentinel to NaN.

    Args:
        v: a number or numeric string; anything else is "missing".
        default: value returned for non-numeric input (NaN by default;
            earlier pipeline versions returned -1 for missing).

    Returns:
        float(v); NaN when the value equals -1; otherwise *default*.
    """
    if isinstance(v, (int, float)):
        val = float(v)
        return np.nan if val == -1 else val
    if isinstance(v, str):
        # BUG FIX: was a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; float(str) can only raise
        # ValueError, so catch exactly that.
        try:
            val = float(v)
        except ValueError:
            return default
        return np.nan if val == -1 else val
    return default
|
| 65 |
+
|
| 66 |
+
# ============================================================
# 1. LOAD DATA (v8 feature matrix)
# ============================================================
print("=" * 70)
print(" V38.2-PRO-V7: PS DATA QUALITY FIX + SCHOOL PCTILE + ABLATION")
print("=" * 70)
print(f" ABLATE_PS_BERT = {ABLATE_PS_BERT}")

# Try v8 first, fall back to v6, then v5
v8_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v8.csv')
v6_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v6.csv')
v5_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v5.csv')
if os.path.exists(v8_path):
    df_raw = pd.read_csv(v8_path)
    print(f"V8 features loaded: {df_raw.shape}")
elif os.path.exists(v6_path):
    df_raw = pd.read_csv(v6_path)
    print(f"V6 features loaded (v8 not found): {df_raw.shape}")
else:
    # ROBUSTNESS FIX: the original read v5 unconditionally here, so a
    # machine with no feature matrix at all died with a raw pandas
    # FileNotFoundError naming only the v5 path. Fail with an explicit
    # message listing what was searched.
    if not os.path.exists(v5_path):
        raise FileNotFoundError(
            f"No integrated feature matrix found in {DATA_DIR} "
            f"(tried v8, v6, v5)")
    df_raw = pd.read_csv(v5_path)
    print(f"V5 features loaded: {df_raw.shape}")
|
| 87 |
+
|
| 88 |
+
# Load LLM features
# Each optional JSON side-file is parsed into llm_features_loaded under a
# short variable name; missing files degrade to an empty dict.
_LLM_FEATURE_FILES = (
    ('llm_activity_scores.json', 'act_scores'),
    ('llm_supp_quality_all.json', 'supp_scores'),
    ('llm_major_difficulty.json', 'major_diff'),
    ('ps_yale_scores.json', 'ps_yale'),
)
llm_features_loaded = {}
for fname, varname in _LLM_FEATURE_FILES:
    fpath = os.path.join(DATA_DIR, fname)
    if not os.path.exists(fpath):
        llm_features_loaded[varname] = {}
        continue
    with open(fpath) as f:
        llm_features_loaded[varname] = json.load(f)
    print(f" Loaded {fname}: {len(llm_features_loaded[varname])} entries")
|
| 103 |
+
|
| 104 |
+
# Load raw data to get ED2 round info
# Builds round_lookup: "<student_id>_<school name>" -> round string, parsed
# out of the free-text 'school_results_summary' column of the raw CSV.
import re
RAW_CSV = os.path.join(DATA_DIR, 'students_with_essays_merged_clean.csv')
round_lookup = {}
if os.path.exists(RAW_CSV):
    print(f"\n Loading raw CSV for ED2 round info...")
    try:
        # Chunked read keeps memory bounded; dtype=str avoids pandas
        # guessing numeric types for the id column.
        raw_chunks = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'],
                                 dtype=str, chunksize=500)
        for chunk in raw_chunks:
            for _, row in chunk.iterrows():
                # NOTE(review): .replace('.0', '') strips ANY ".0"
                # substring, not just a trailing one (e.g. "10.05" ->
                # "105"); the same normalization is used when these keys
                # are consumed, so both sides stay consistent — confirm
                # ids are integer-like before changing it.
                sid = str(row.get('student_id', '')).replace('.0', '')
                summary = str(row.get('school_results_summary', ''))
                # Split the summary into numbered entries ("1. ...", "2. ...")
                # via a lookahead so the numbers stay with their entry.
                entries = re.split(r'(?=\d+\.)', summary)
                for entry in entries:
                    # Leftmost match wins, so "Restrictive Early Action"
                    # is found at its own start before the embedded
                    # "Early Action" could match; "... II" variants are
                    # listed before their prefixes for the same reason.
                    m = re.search(r'(Early Decision II|Early Decision|Early Action II|Early Action|Restrictive Early Action|Regular Decision)', entry)
                    if m:
                        round_type = m.group(1)
                        # School name = text after "N." up to a dash or "(".
                        school_m = re.search(r'\d+\.\s*(.+?)(?:\s*[-–]\s*|\s*\()', entry)
                        if school_m:
                            school_name = school_m.group(1).strip()
                            key = f"{sid}_{school_name}"
                            round_lookup[key] = round_type
        print(f" Round lookup built: {len(round_lookup)} entries")
    except Exception as e:
        # Best-effort: the lookup is an enhancement, not a requirement,
        # so any parse/read failure just leaves round_lookup sparse.
        print(f" Warning: Could not load raw CSV: {e}")
|
| 130 |
+
|
| 131 |
+
# ============================================================
# 2. DATA CLEANING & QUALITY FIXES
# ============================================================
print(f"\n{'='*70}")
print(f" DATA QUALITY FIXES")
print(f"{'='*70}")

# 2a. Filter years: drop the 2018/2019 cohorts entirely.
df = df_raw[~df_raw['year'].isin([2018, 2019])].copy()
df = df.reset_index(drop=True)
print(f"After filtering 2018-2019: {df.shape}")

# 2b. FIX #1: SAT=0 -> NaN + has_sat
# has_sat is computed BEFORE the 0->NaN rewrite; NaN > 0 evaluates False,
# so rows already missing SAT also get has_sat=0.
sat_zero = (df['sat'] == 0).sum()
df['has_sat'] = (df['sat'] > 0).astype(int)
df.loc[df['sat'] == 0, 'sat'] = np.nan
print(f"\n FIX #1: SAT=0 -> NaN: {sat_zero} rows ({sat_zero/len(df)*100:.1f}%)")
print(f" has_sat=1: {df['has_sat'].sum()}, has_sat=0: {(df['has_sat']==0).sum()}")

# 2c. FIX #2: TOEFL=0 -> NaN + has_toefl (same pattern as FIX #1)
toefl_zero = (df['toefl'] == 0).sum()
df['has_toefl'] = (df['toefl'] > 0).astype(int)
df.loc[df['toefl'] == 0, 'toefl'] = np.nan
print(f" FIX #2: TOEFL=0 -> NaN: {toefl_zero} rows ({toefl_zero/len(df)*100:.1f}%)")

# 2d. FIX #3: GPA=0 -> NaN (v5 already has has_gpa)
gpa_zero = (df['gpa'] == 0).sum()
df.loc[df['gpa'] == 0, 'gpa'] = np.nan
print(f" FIX #3: GPA=0 -> NaN: {gpa_zero} rows ({gpa_zero/len(df)*100:.1f}%)")
if 'has_gpa' not in df.columns:
    df['has_gpa'] = df['gpa'].notna().astype(int)
print(f" has_gpa=1: {(df['has_gpa']==1).sum()}, has_gpa=0: {(df['has_gpa']==0).sum()}")

# 2e. FIX #4: -1 -> NaN for sentinel columns
# taste_yearly_admits_log always uses the -1 sentinel; the historical-rate
# columns are optional, so they are appended only if present.
sentinel_cols = ['taste_yearly_admits_log']
for col in ['hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_overall_hist_rate']:
    if col in df.columns:
        sentinel_cols.append(col)

for col in sentinel_cols:
    if col in df.columns:
        n_neg1 = (df[col] == -1).sum()
        df.loc[df[col] == -1, col] = np.nan
        print(f" FIX #4: {col}: -1 -> NaN: {n_neg1} rows ({n_neg1/len(df)*100:.1f}%)")

# 2f. FIX #5: has_ps=0 -> ps_bert all NaN
# Rows without a personal statement must not carry PS embedding values.
ps_bert_cols = [c for c in df.columns if c.startswith('ps_bert_pca_')]
no_ps_mask = df['has_ps'] == 0  # reused by the ps2 fix further down
if ps_bert_cols:
    # NOTE(review): n_fix is bound only inside this branch yet is read
    # again by the FIX #6 print below -> NameError if the feature file
    # has no ps_bert_pca columns. Worth fixing at the FIX #6 site.
    n_fix = no_ps_mask.sum()
    for col in ps_bert_cols:
        df.loc[no_ps_mask, col] = np.nan
    print(f" FIX #5: ps_bert -> NaN for has_ps=0: {n_fix} rows, {len(ps_bert_cols)} columns")
else:
    print(f" FIX #5: No ps_bert_pca columns found")
|
| 186 |
+
|
| 187 |
+
# 2f-v7. FIX #6 (NEW): has_ps=0 -> ALL ps2_* scores NaN
# Previously ps2 scores were broadcast to has_ps=0 rows (5057 polluted rows!)
ps2_score_cols = [c for c in df.columns if c.startswith('ps2_') and c != 'ps2_essay_type']
# ROBUSTNESS: ps2_mean may be absent when an older feature file (pre-ps2
# pipeline) was loaded; the original indexed it unconditionally.
if 'ps2_mean' in df.columns:
    n_ps2_polluted = (no_ps_mask & df['ps2_mean'].notna()).sum()
else:
    n_ps2_polluted = 0
for col in ps2_score_cols:
    df.loc[no_ps_mask, col] = np.nan
# BUG FIX: the original printed `n_fix`, which is bound only inside the
# `if ps_bert_cols:` branch of FIX #5 — a NameError whenever the feature
# file ships no ps_bert_pca columns. Compute the row count locally.
n_no_ps = no_ps_mask.sum()
print(f" FIX #6 (V7 NEW): ps2_* -> NaN for has_ps=0: {n_no_ps} rows, {len(ps2_score_cols)} cols")
print(f" Previously polluted ps2 rows: {n_ps2_polluted}")

# 2f-v7b. REMOVE ps2_is_cliche_topic (53.5% prevalence, no signal)
if 'ps2_is_cliche_topic' in df.columns:
    df.drop(columns=['ps2_is_cliche_topic'], inplace=True)
    print(f" FIX #6b (V7 NEW): Removed ps2_is_cliche_topic (53.5% prevalence, no signal)")

# 2f-v7c. ABLATION: Remove ps_bert_pca if flag is set
if ABLATE_PS_BERT and ps_bert_cols:
    df.drop(columns=ps_bert_cols, inplace=True)
    print(f" ABLATION: Removed {len(ps_bert_cols)} ps_bert_pca columns")

# Also set ps_word_count to NaN for has_ps=0 (it's already 0, but be explicit)
# ROBUSTNESS: guard the column the same way as ps2_mean above.
if 'ps_word_count' in df.columns:
    df.loc[no_ps_mask, 'ps_word_count'] = np.nan
|
| 208 |
+
|
| 209 |
+
# 2g. FIX portfolio_size: log transform + cap (from V2)
# Heavy right tail: cap at 20 applications, then log1p to compress.
print(f"\n Portfolio size transform:")
print(f" Before: mean={df['portfolio_size'].mean():.1f}, max={df['portfolio_size'].max():.0f}")
df['portfolio_size_raw'] = df['portfolio_size'].copy()
df['portfolio_size'] = np.log1p(df['portfolio_size'].clip(upper=20))
print(f" After log(clip(x,20)): mean={df['portfolio_size'].mean():.2f}, max={df['portfolio_size'].max():.2f}")
# BUG FIX: pd.cut intervals are right-closed, so with bins starting at 0 a
# raw size of exactly 0 fell outside every bin -> NaN -> the .astype(int)
# below raised ValueError. include_lowest=True folds 0 into the first bin;
# all other values bin exactly as before.
df['portfolio_size_bin'] = pd.cut(df['portfolio_size_raw'],
                                  bins=[0, 5, 10, 15, 20, 100],
                                  labels=[0, 1, 2, 3, 4],
                                  include_lowest=True).astype(int)
|
| 218 |
+
|
| 219 |
+
# 2h. ED2 split (from V2)
def get_detailed_round(row):
    """Resolve the detailed application round for one application row.

    Prefers the raw-summary lookup (round_lookup, keyed by
    "<student_id>_<school>"); falls back to the coarse round_cat column,
    normalizing the legacy 'ED' label to 'ED1'.
    """
    sid = str(row.get('student_id', '')).replace('.0', '')
    school = str(row.get('school', ''))
    raw_round = round_lookup.get(f"{sid}_{school}", '')
    # Ordered table: the more specific labels come first because
    # 'Early Decision II' contains 'Early Decision' and
    # 'Restrictive Early Action' contains 'Early Action'.
    for needle, label in (('Early Decision II', 'ED2'),
                          ('Early Decision', 'ED1'),
                          ('Restrictive Early Action', 'REA'),
                          ('Early Action II', 'EA'),
                          ('Early Action', 'EA'),
                          ('Regular Decision', 'RD')):
        if needle in raw_round:
            return label
    fallback = str(row.get('round_cat', 'RD'))
    return 'ED1' if fallback == 'ED' else fallback

df['round_cat_v2'] = df.apply(get_detailed_round, axis=1)
print(f"\n Round distribution (v2):")
print(df['round_cat_v2'].value_counts().to_string())

# One-hot round flags plus a combined "any early round" indicator.
for flag_col, rounds in (('is_ed1', ['ED1']),
                         ('is_ed2', ['ED2']),
                         ('is_rea', ['REA']),
                         ('is_early', ['ED1', 'ED2', 'EA', 'REA'])):
    df[flag_col] = df['round_cat_v2'].isin(rounds).astype(int)
df['round_cat'] = df['round_cat_v2']
|
| 249 |
+
|
| 250 |
+
# ============================================================
# 3. PARSE LLM FEATURES
# ============================================================
# Each LLM-output JSON may arrive either as a list of per-record dicts or
# as an already-keyed dict; both shapes are normalized into flat lookups.

# Activity scores: keyed by the raw student_id AND by every purely-numeric
# underscore-separated token of it (ids sometimes arrive as composites
# like "name_12345", so both spellings resolve to the same record).
act_scores = {}
raw = llm_features_loaded.get('act_scores', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict) and item.get('success', False):
            sid_raw = str(item.get('student_id', ''))
            act_scores[sid_raw] = item
            parts = sid_raw.split('_')
            for p in parts:
                clean = p.replace('.0', '')
                if clean.isdigit():
                    act_scores[clean] = item
elif isinstance(raw, dict):
    for sid, scores in raw.items():
        if isinstance(scores, dict):
            act_scores[sid] = scores

# Supplement-essay scores: keyed by "<student_id>_<school>". Records whose
# overall_quality is <= 1 are dropped as degenerate scorings.
supp_scores = {}
raw = llm_features_loaded.get('supp_scores', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict) and item.get('success', False):
            sid = str(item.get('student_id', '')).replace('.0', '')
            school = str(item.get('school', ''))
            key = f"{sid}_{school}"
            oq = item.get('overall_quality', 0)
            if isinstance(oq, (int, float)) and oq <= 1:
                continue
            supp_scores[key] = item
elif isinstance(raw, dict):
    for key, scores in raw.items():
        if isinstance(scores, dict):
            oq = scores.get('overall_quality', 0)
            if isinstance(oq, (int, float)) and oq <= 1:
                continue
            supp_scores[key] = scores
print(f" Supp scores after filtering score=1: {len(supp_scores)} valid entries")

# Major difficulty: only the dict shape is usable; a list payload is
# discarded (no key to index it by).
major_diff = llm_features_loaded.get('major_diff', {})
if isinstance(major_diff, list):
    major_diff = {}

# Personal-statement ("Yale rubric") scores, keyed by normalized student_id.
ps_yale = {}
raw = llm_features_loaded.get('ps_yale', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict):
            sid = str(item.get('student_id', '')).replace('.0', '')
            ps_yale[sid] = item
elif isinstance(raw, dict):
    ps_yale = raw

print(f"\nLLM features: Activity={len(act_scores)}, Supp={len(supp_scores)}, MajorDiff={len(major_diff)}, PS={len(ps_yale)}")

# Score dimensions extracted per activity record.
ACT_DIMS = ['max_power_index', 'avg_power_index', 'n_high_power',
            'n_founder', 'n_president', 'max_scope',
            'has_publication', 'has_patent', 'has_summer_program',
            'summer_program_tier', 'has_olympiad', 'olympiad_level',
            'activity_coherence', 'spike_strength']

# Score dimensions extracted per supplement-essay record.
SUPP_DIMS = ['overall_quality', 'specificity_score', 'enthusiasm_score',
             'has_imagination_scene', 'mentions_specific_course',
             'mentions_specific_professor', 'mentions_specific_program',
             'mentions_specific_facility', 'coherence_with_major', 'has_red_flag']

# PS dimensions are discovered from one sample record, skipping metadata
# keys and boolean is_* flags; a fixed fallback covers empty/missing data.
sample_ps = next(iter(ps_yale.values()), {}) if ps_yale else {}
PS_DIMS = [k for k in sample_ps.keys() if k not in ['student_id', 'success', 'error', 'note', 'essay_type']
           and not k.startswith('is_')]
if not PS_DIMS:
    PS_DIMS = ['show_not_tell', 'reflection_depth', 'authentic_voice',
               'coherence_focus', 'overall_effectiveness']
|
| 324 |
+
|
| 325 |
+
# ============================================================
# 4. DEFINE FEATURE GROUPS
# ============================================================
# Per-applicant numeric columns taken from the integrated feature matrix.
# Missing columns are tolerated: the list is intersected with df.columns
# below, so older feature-file versions still run.
STUDENT_LEVEL_NUMERIC = [
    'toefl', 'sat', 'gpa',
    'act_total_count', 'act_type_diversity',
    *[f'act_slot_pca_{i}' for i in range(20)],
    *[f'act_bert_pca_{i}' for i in range(16)],
    'honors_max_score', 'honors_avg_score', 'honors_min_score',
    'honors_count', 'honors_total_score',
    'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
    'honors_has_national',
    'honors_quality_ratio',
    'cuilu_hs_top10_rate', 'cuilu_hs_top20_rate',
    'cuilu_hs_top10_count', 'cuilu_hs_top20_count',
    'cuilu_hs_total',
    'cuilu_feeder_rank', 'cuilu_hs_type_rate', 'cuilu_region_rate',
    'hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_to_univ_hist_admits',
    'hs_overall_hist_rate',
    'summer_max_geili', 'summer_has_elite', 'summer_count',
    'summer_program_count', 'summer_difficulty_max',
    # PS V2 scores (ps2_is_cliche_topic REMOVED in V7)
    'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice', 'ps2_overall', 'ps2_mean',
    'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
    'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
]

# Conditionally include ps_bert_pca (for ablation)
if not ABLATE_PS_BERT:
    STUDENT_LEVEL_NUMERIC.extend([f'ps_bert_pca_{i}' for i in range(16)])

# Identify PS-related features for special school_mean handling
# (V7 FIX #7: their school means are computed from has_ps=1 rows only).
PS_RELATED_FEATURES = set([
    *[f'ps_bert_pca_{i}' for i in range(16)],
    'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice',
    'ps2_overall', 'ps2_mean',
    'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
    'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
    'ps_word_count',
])

# Add act_type_count columns dynamically (their names vary per data build)
act_type_cols_in_data = [c for c in df.columns if c.startswith('act_type_count_')]
STUDENT_LEVEL_NUMERIC.extend(act_type_cols_in_data)

# Filter to only existing columns
STUDENT_LEVEL_NUMERIC = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
print(f"\n Student-level numeric features: {len(STUDENT_LEVEL_NUMERIC)}")

# Compact subset of the strongest student signals; consumed downstream
# (exact usage is outside this chunk).
KEY_STUDENT_FEATURES = [
    'toefl', 'sat', 'gpa',
    'honors_max_score', 'honors_avg_score', 'honors_count',
    'honors_quality_ratio',
    'act_type_diversity', 'act_total_count',
    'hs_to_univ_hist_rate_smoothed',
    'summer_max_geili',
    # PS V2 scores
    'ps2_overall', 'ps2_character_revelation', 'ps2_craft_voice',
]

# LLM-derived aggregate columns (created in build_features_base) used as
# interaction-feature candidates.
LLM_INTERACTION_FEATURES = [
    'llm_act_mean', 'llm_act_max', 'llm_act_avg_power_index',
    'supp_mean', 'supp_max', 'ps_mean',
    'major_difficulty',
    # PS V2 scores
    'ps2_mean', 'ps2_overall',
]
|
| 392 |
+
|
| 393 |
+
# ============================================================
# 5. BUILD FEATURES
# ============================================================
def build_features_base(df):
    """Build base features WITHOUT residualization.

    Adds LLM-derived activity / supplement / personal-statement scores,
    simple numeric interactions, and label-encodes the categorical
    columns. Reads the module-level lookups (act_scores, supp_scores,
    major_diff, ps_yale) and dimension lists (ACT_DIMS, SUPP_DIMS,
    PS_DIMS) plus the safe_num helper.

    Args:
        df: the cleaned application-level frame (one row per application).

    Returns:
        (df, cat_cols): augmented copy of the frame and the names of the
        label-encoded categorical columns.
    """
    df = df.copy()

    df['is_partial_year'] = (df['year'] == 2025).astype(int)
    df['year_cat'] = df['year'].astype(str)
    # NOTE(review): .replace('.0', '') strips ANY ".0" substring, not only
    # a trailing one; kept because the lookup keys elsewhere in this file
    # are built with the same normalization, so both sides agree.
    df['sid_str'] = df['student_id'].astype(str).str.replace('.0', '', regex=False)

    # LLM Activity features (per-student lookup)
    for dim in ACT_DIMS:
        df[f'llm_act_{dim}'] = df['sid_str'].map(
            lambda s, d=dim: safe_num(act_scores.get(s, {}).get(d, np.nan)))

    # LLM Supp features (per student+school lookup).
    # PERF FIX: build the lookup key once as a vectorized Series and .map
    # per dimension, instead of the original row-wise df.apply repeated
    # for every dimension (same values, far fewer Python-level calls).
    supp_key = df['sid_str'] + '_' + df['school'].astype(str)
    for dim in SUPP_DIMS:
        df[f'supp_{dim}'] = supp_key.map(
            lambda k, d=dim: safe_num(supp_scores.get(k, {}).get(d, np.nan)))

    # Major difficulty (per school+major lookup), same hoisted-key pattern.
    major_key = df['school'].astype(str) + '_' + df['major_cat'].astype(str)
    df['major_difficulty'] = major_key.map(
        lambda k: safe_num(major_diff.get(k, {}).get('difficulty_score', np.nan)))

    # PS Yale scores (per-student lookup)
    for dim in PS_DIMS:
        df[f'ps_{dim}'] = df['sid_str'].map(
            lambda s, d=dim: safe_num(ps_yale.get(s, {}).get(d, np.nan)))

    # Aggregates: NaN-aware row statistics over each LLM score family.
    llm_act_cols = [f'llm_act_{d}' for d in ACT_DIMS]
    valid_act = df[llm_act_cols]
    df['llm_act_mean'] = valid_act.mean(axis=1)
    df['llm_act_max'] = valid_act.max(axis=1)
    df['llm_act_n_valid'] = valid_act.notna().sum(axis=1)

    # has_red_flag is a boolean flag, not a quality score — exclude it.
    supp_num_cols = [f'supp_{d}' for d in SUPP_DIMS if d not in ['has_red_flag']]
    valid_supp = df[supp_num_cols]
    df['supp_mean'] = valid_supp.mean(axis=1)
    df['supp_max'] = valid_supp.max(axis=1)

    ps_cols = [f'ps_{d}' for d in PS_DIMS]
    valid_ps = df[ps_cols]
    df['ps_mean'] = valid_ps.mean(axis=1)

    # Basic interactions (denominators keep magnitudes comparable)
    df['toefl_x_sat'] = df['toefl'] * df['sat'] / 10000.0
    df['gpa_x_toefl'] = df['gpa'] * df['toefl'] / 100.0
    df['llm_act_x_supp'] = df['llm_act_mean'] * df['supp_mean']

    if 'honors_avg_score' in df.columns:
        df['honors_x_sat'] = df['honors_avg_score'] * df['sat'] / 1600
        df['honors_x_toefl'] = df['honors_avg_score'] * df['toefl'] / 120

    if 'cuilu_hs_top10_rate' in df.columns and 'taste_score_sensitivity' in df.columns:
        df['cuilu_x_taste'] = df['cuilu_hs_top10_rate'] * df['taste_score_sensitivity']

    # Categoricals: label-encode to integer codes for the tree models.
    cat_cols = ['school', 'round_cat', 'major_cat', 'hs_cat', 'year_cat', 'hs_name', 'province']
    cat_cols = [c for c in cat_cols if c in df.columns]

    # Cross categoricals (built from the raw strings, before encoding).
    if 'round_cat' in df.columns:
        df['school_round'] = df['school'].astype(str) + '_' + df['round_cat'].astype(str)
        cat_cols.append('school_round')
    df['school_major'] = df['school'].astype(str) + '_' + df['major_cat'].astype(str)
    cat_cols.append('school_major')
    if 'hs_cat' in df.columns:
        df['school_hstype'] = df['school'].astype(str) + '_' + df['hs_cat'].astype(str)
        cat_cols.append('school_hstype')

    for c in cat_cols:
        df[c] = df[c].fillna('_MISSING_').astype(str)
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c]).astype(int)

    return df, cat_cols
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
def _school_pctile_series(df, ref_df, col):
    """Within-school percentile of df[col] against ref_df's (training) distribution.

    For each row, returns the fraction of training values for that row's school
    that are <= the row's value; NaN when the value is missing or the school has
    too few (<3) training observations. Shared by Step 2b and Step 4 below
    (previously duplicated inline).
    """
    distributions = {}
    for school_id in ref_df['school'].unique():
        vals = ref_df[ref_df['school'] == school_id][col].dropna().values
        if len(vals) > 2:  # require at least 3 observations for a meaningful percentile
            distributions[school_id] = vals

    def _pct(row, col=col, sd=distributions):
        school_id = row['school']
        val = row[col]
        if pd.isna(val) or school_id not in sd:
            return np.nan
        dist = sd[school_id]
        return np.mean(dist <= val)

    return df.apply(_pct, axis=1)


def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
    """Add residualized + interaction + ED boost features using ONLY training data statistics.

    V7 KEY FIX: For PS-related features, school_mean uses ONLY has_ps=1 rows.

    Parameters
    ----------
    df : DataFrame with base features (label-encoded categoricals included).
    train_mask : boolean Series aligned with df.index; True = training row.
        All group statistics (rates, means, distributions) are computed only
        from these rows to avoid leaking validation/test information.
    cat_cols : list of categorical column names (already label-encoded ints).
    selected_features : optional set of feature names to keep (plus a
        must-keep core); None keeps every generated feature.

    Returns
    -------
    (df, feature_cols, cat_cols, cat_indices) where cat_indices are positions
    of cat_cols within feature_cols.
    """
    df = df.copy()

    # Step 1: Bayesian-smoothed school_base_rate
    train_df = df[train_mask]
    global_rate = train_df[TARGET].mean()

    school_stats = train_df.groupby('school').agg(
        school_raw_rate=(TARGET, 'mean'),
        school_n_apps=(TARGET, 'count'),
        school_n_admits=(TARGET, 'sum'),
    ).reset_index()

    # Shrink small-sample school rates toward the global rate; a school with
    # n apps gets weight n/(n+SMOOTH_STRENGTH) on its own rate.
    SMOOTH_STRENGTH = 30
    school_stats['school_base_rate'] = (
        (school_stats['school_raw_rate'] * school_stats['school_n_apps'] + global_rate * SMOOTH_STRENGTH) /
        (school_stats['school_n_apps'] + SMOOTH_STRENGTH)
    )

    # NOTE: merge resets the index but preserves row order (how='left'),
    # which is what iloc-based callers rely on.
    df = df.merge(school_stats[['school', 'school_base_rate', 'school_n_apps', 'school_n_admits']],
                  on='school', how='left')
    df['school_base_rate'] = df['school_base_rate'].fillna(global_rate)
    df['school_n_apps'] = df['school_n_apps'].fillna(0)
    df['school_n_admits'] = df['school_n_admits'].fillna(0)

    # Step 1b: ED boost per school — admit-rate lift of ED1/ED2 over RD,
    # computed only for schools observed in both pools.
    ed1_mask = train_df['is_ed1'] == 1
    rd_mask = train_df['is_early'] == 0

    ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
    rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()

    ed_boost_map = {}
    for school in ed1_school_rates.index:
        if school in rd_school_rates.index:
            ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
    df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)

    ed2_mask = train_df['is_ed2'] == 1
    ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
    ed2_boost_map = {}
    for school in ed2_school_rates.index:
        if school in rd_school_rates.index:
            ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
    df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)

    # Step 2: Residualize student features (value minus the school's training mean)
    # V7 KEY FIX: For PS-related features, compute school_mean using ONLY has_ps=1 training rows
    student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]

    # Pre-compute the has_ps=1 training subset for PS features
    train_has_ps = train_df[train_df['has_ps'] == 1]

    resid_cols = []
    for col in student_feat_available:
        resid_col = f'{col}_resid'

        # V7 FIX #7: Use has_ps=1 subset for PS-related features
        if col in PS_RELATED_FEATURES:
            school_mean_series = train_has_ps.groupby('school')[col].mean()
        else:
            school_mean_series = train_df.groupby('school')[col].mean()

        col_school_mean = df['school'].map(school_mean_series)
        df[resid_col] = df[col] - col_school_mean
        resid_cols.append(resid_col)

    # Step 2b (V7 NEW): ps2_mean_school_pctile - continuous within-school percentile
    # This solves the granularity problem: ps2_mean has only 17 unique values,
    # but within each school the percentile is continuous.
    pctile_ps_cols = []
    if 'ps2_mean' in df.columns:
        ps_pctile_col = 'ps2_mean_school_pctile'
        # Use ONLY has_ps=1 training rows for school distributions
        df[ps_pctile_col] = _school_pctile_series(df, train_has_ps, 'ps2_mean')
        pctile_ps_cols.append(ps_pctile_col)

        n_valid = df[ps_pctile_col].notna().sum()
        n_unique = df[ps_pctile_col].nunique()
        print(f" V7 NEW: {ps_pctile_col}: {n_valid} valid, {n_unique} unique values")

    # Step 3: Explicit interactions (student feature x school_base_rate)
    interaction_cols = []
    for col in KEY_STUDENT_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

            resid_col = f'{col}_resid'
            if resid_col in df.columns:
                int_resid_col = f'{col}_resid_x_rate'
                df[int_resid_col] = df[resid_col] * df['school_base_rate']
                interaction_cols.append(int_resid_col)

    # Step 3b: LLM feature x school_base_rate interactions
    for col in LLM_INTERACTION_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

    # Step 3c: portfolio_size x school_base_rate interaction
    if 'portfolio_size' in df.columns:
        df['portfolio_x_school_rate'] = df['portfolio_size'] * df['school_base_rate']
        interaction_cols.append('portfolio_x_school_rate')

    # Step 3d: ED flag x school_ed_boost interaction
    if 'is_ed1' in df.columns:
        df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
        interaction_cols.append('ed1_x_ed_boost')
    if 'is_ed2' in df.columns:
        df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
        interaction_cols.append('ed2_x_ed2_boost')

    # Step 3e: has_sat/has_toefl/has_gpa interactions with school_base_rate
    for flag in ['has_sat', 'has_toefl', 'has_gpa']:
        if flag in df.columns:
            int_col = f'{flag}_x_school_rate'
            df[int_col] = df[flag] * df['school_base_rate']
            interaction_cols.append(int_col)

    # Step 3f (V7 NEW): ps2_mean_school_pctile x school_base_rate
    if 'ps2_mean_school_pctile' in df.columns:
        df['ps2_pctile_x_school_rate'] = df['ps2_mean_school_pctile'] * df['school_base_rate']
        interaction_cols.append('ps2_pctile_x_school_rate')

    # Step 4: Student percentile within school (NaN-safe)
    pctile_cols = []
    for col in ['toefl', 'sat', 'gpa', 'honors_max_score',
                'llm_act_mean', 'supp_mean']:
        if col not in df.columns:
            continue
        pctile_col = f'{col}_school_pctile'
        df[pctile_col] = _school_pctile_series(df, train_df, col)
        pctile_cols.append(pctile_col)

    # Merge ps2 pctile into pctile_cols for reporting
    pctile_cols.extend(pctile_ps_cols)

    # Step 5: Student competitiveness score (NaN-safe).
    # Note: this is an UNWEIGHTED mean of scale-normalized components
    # (the per-component weights that existed here were never applied — dead
    # code removed in this revision; behavior unchanged).
    if all(c in df.columns for c in ['toefl', 'sat', 'honors_max_score']):
        components = []
        for col, scale in [('toefl', 120), ('sat', 1600),
                           ('honors_max_score', 10), ('llm_act_mean', 10)]:
            if col in df.columns:
                components.append(df[col] / scale)
        if components:
            strength_df = pd.DataFrame(components).T
            df['student_strength'] = strength_df.mean(axis=1)
            df['strength_vs_school'] = df['student_strength'] - (1 - df['school_base_rate'])

    # Build final feature list: every numeric column plus the categoricals,
    # minus identifiers/targets.
    num_cols = [c for c in df.columns if df[c].dtype in ['float64', 'int64', 'float32', 'int32']
                and c not in [TARGET, 'student_id', 'year', 'Unnamed: 0']]

    all_feat = list(set(num_cols + cat_cols))
    feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
    for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size_raw']:
        if remove in feature_cols:
            feature_cols.remove(remove)

    # Remove constant columns
    to_drop = [c for c in feature_cols if df[c].nunique() <= 1]
    feature_cols = [c for c in feature_cols if c not in to_drop]

    # Apply feature selection if provided (core features are always retained)
    if selected_features is not None:
        must_keep = set(cat_cols) | {'school_base_rate', 'school_n_apps', 'school_n_admits',
                                     'student_strength', 'strength_vs_school',
                                     'school_ed_boost', 'school_ed2_boost',
                                     'is_ed1', 'is_ed2', 'is_rea', 'is_early',
                                     'ed1_x_ed_boost', 'ed2_x_ed2_boost',
                                     'has_sat', 'has_toefl', 'has_gpa',
                                     'portfolio_size', 'portfolio_size_bin', 'portfolio_x_school_rate',
                                     # V7: always keep new PS features
                                     'ps2_mean_school_pctile', 'ps2_pctile_x_school_rate'}
        feature_cols = [c for c in feature_cols if c in selected_features or c in must_keep]

    # Handle inf (e.g. from divisions upstream) so tree libraries see NaN instead
    for c in feature_cols:
        if df[c].dtype in ['float64', 'float32']:
            df[c] = df[c].replace([np.inf, -np.inf], np.nan)

    cat_indices = [feature_cols.index(c) for c in cat_cols if c in feature_cols]

    # +5 accounts for the school-level base columns added in Step 1/1b
    new_feat_count = len(resid_cols) + len(interaction_cols) + len(pctile_cols) + 5
    print(f" Resid features: {len(resid_cols)} resid + {len(interaction_cols)} interact + {len(pctile_cols)} pctile = {new_feat_count} new, total={len(feature_cols)}")

    return df, feature_cols, cat_cols, cat_indices
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
# ============================================================
# 6. BUILD BASE FEATURES
# ============================================================
# Build the fold-independent feature frame once; fold-dependent features
# (residuals, school rates) are added later per split.
df_base, cat_cols = build_features_base(df)
print(f"\nBase features built. Shape: {df_base.shape}")

# Quick NaN summary
print(f"\n NaN summary after fixes:")
for col in ['sat', 'toefl', 'gpa', 'ps2_mean', 'ps2_overall']:
    if col in df_base.columns:
        nan_pct = df_base[col].isna().mean() * 100
        print(f" {col}: {nan_pct:.1f}% NaN")

# V7: Verify ps2 cleanup — after FIX #6, rows with has_ps=0 must carry no
# ps2_* scores at all.
no_ps_check = df_base[df_base['has_ps'] == 0]
if 'ps2_mean' in df_base.columns:
    ps2_polluted = no_ps_check['ps2_mean'].notna().sum()
    print(f"\n V7 VERIFY: ps2_mean non-NaN for has_ps=0: {ps2_polluted} (should be 0)")

# Target vector and CV grouping key.
# groups = student_id so GroupKFold keeps all of one student's applications
# in the same fold (NOTE(review): presumes students have multiple rows —
# confirm upstream).
y = df_base[TARGET].values
groups = df_base['student_id'].values
|
| 724 |
+
|
| 725 |
+
# ============================================================
# 7. STAGE 1: FEATURE IMPORTANCE ESTIMATION
# ============================================================
# Cheap 5-fold CatBoost pass over the FULL feature set; the averaged
# importances drive feature selection for Stage 2.
print(f"\n{'='*70}")
print(f" STAGE 1: FEATURE IMPORTANCE ESTIMATION")
print(f"{'='*70}")

stage1_fi = []
gkf_s1 = GroupKFold(n_splits=5)
for fold, (tr_idx, va_idx) in enumerate(gkf_s1.split(df_base, y, groups)):
    # Boolean mask of training rows; add_residualized_features uses only
    # these rows for its statistics (no leakage from the validation fold).
    train_mask = pd.Series(False, index=df_base.index)
    train_mask.iloc[tr_idx] = True

    df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
        df_base, train_mask, cat_cols)

    X_tr = df_fold[feat_cols_f].iloc[tr_idx]
    X_va = df_fold[feat_cols_f].iloc[va_idx]
    y_tr = y[tr_idx]
    y_va = y[va_idx]

    # CatBoost requires integer dtype for its categorical columns.
    for c in cat_cols_f:
        if c in X_tr.columns:
            X_tr[c] = X_tr[c].astype(int)
            X_va[c] = X_va[c].astype(int)

    cb = CatBoostClassifier(
        iterations=500, depth=6, learning_rate=0.05,
        l2_leaf_reg=7, random_seed=42, verbose=0,
        cat_features=cat_idx_f, eval_metric='AUC',
        early_stopping_rounds=50)
    pool_tr = Pool(X_tr, y_tr, cat_features=cat_idx_f)
    pool_va = Pool(X_va, y_va, cat_features=cat_idx_f)
    cb.fit(pool_tr, eval_set=pool_va, verbose=0)

    fi = cb.get_feature_importance()
    stage1_fi.append(fi)

    auc = roc_auc_score(y_va, cb.predict_proba(Pool(X_va, cat_features=cat_idx_f))[:, 1])
    print(f" Fold {fold+1}/5: AUC={auc:.4f}, Features={len(feat_cols_f)}")

    # NOTE(review): fold-0 feature names are later zipped with importances
    # averaged over ALL folds — this assumes every fold produces the same
    # feature list in the same order; constant-column dropping inside
    # add_residualized_features could break that. Confirm.
    if fold == 0:
        all_feature_names = feat_cols_f

    del cb, pool_tr, pool_va, df_fold; gc.collect()

# Select top features: keep all categoricals unconditionally, then add the
# FEATURE_SELECT_TOP_N highest-importance numeric features.
avg_fi = np.mean(stage1_fi, axis=0)
fi_pairs = sorted(zip(all_feature_names, avg_fi), key=lambda x: -x[1])

selected_set = set(cat_cols)
n_added = 0
for fname, imp in fi_pairs:
    if fname not in cat_cols:
        selected_set.add(fname)
        n_added += 1
        if n_added >= FEATURE_SELECT_TOP_N:
            break

print(f"\n Feature selection: {len(all_feature_names)} -> {len(selected_set)} features")
print(f" Top 30 features:")
# Markers tag feature family: [R]esidual, [I]nteraction, [P]ctile,
# [S]chool rate, [ED] boost, [PS2] personal-statement features.
for i, (fname, imp) in enumerate(fi_pairs[:30]):
    marker = ""
    if '_resid' in fname: marker = " [R]"
    elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [I]"
    elif '_school_pctile' in fname: marker = " [P]"
    elif 'school_base_rate' in fname: marker = " [S]"
    elif 'ed_boost' in fname: marker = " [ED]"
    elif 'ps2_' in fname: marker = " [PS2]"
    print(f" {i+1:3d}. {fname:<50s} {imp:>8.2f}{marker}")
|
| 795 |
+
|
| 796 |
+
# ============================================================
# 8. TEMPORAL VALIDATION WITH SELECTED FEATURES
# ============================================================
# Train on 2020-2023, evaluate on 2024 — a forward-in-time check that the
# GroupKFold scores are not an artifact of within-year leakage.
print(f"\n{'='*70}")
print(f" TEMPORAL VALIDATION (2020-2023 -> 2024) WITH FEATURE SELECTION")
print(f"{'='*70}")

mask_train_temporal = df_base['year'].isin([2020, 2021, 2022, 2023])
mask_test_temporal = df_base['year'] == 2024

temporal_results = {}
if mask_test_temporal.sum() > 0:
    df_temporal, feat_cols_t, cat_cols_t, cat_idx_t = add_residualized_features(
        df_base, mask_train_temporal, cat_cols, selected_features=selected_set)

    X_t = df_temporal[feat_cols_t].copy()
    for c in cat_cols_t:
        if c in X_t.columns:
            X_t[c] = X_t[c].astype(int)

    X_tr_t = X_t[mask_train_temporal]
    X_te_t = X_t[mask_test_temporal]
    y_tr_t = y[mask_train_temporal]
    y_te_t = y[mask_test_temporal]

    # LGB/XGB have no native NaN-aware categorical handling here; use a
    # sentinel fill. CatBoost gets the raw (NaN-bearing) frame.
    X_tr_t_filled = X_tr_t.fillna(-999)
    X_te_t_filled = X_te_t.fillna(-999)

    print(f" Train: {len(X_tr_t)}, Test: {len(X_te_t)}, Features: {len(feat_cols_t)}")

    # NOTE(review): all three models early-stop on the 2024 TEST set itself,
    # which makes these temporal AUCs somewhat optimistic — confirm this is
    # an accepted convention for the version-to-version comparison.
    for seed in SEEDS:
        cb_t = CatBoostClassifier(
            iterations=1000, depth=6, learning_rate=0.03,
            l2_leaf_reg=7, random_seed=seed, verbose=0,
            cat_features=cat_idx_t, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=10)
        pool_tr = Pool(X_tr_t, y_tr_t, cat_features=cat_idx_t)
        pool_te = Pool(X_te_t, y_te_t, cat_features=cat_idx_t)
        cb_t.fit(pool_tr, eval_set=pool_te, verbose=0)
        cb_pred = cb_t.predict_proba(Pool(X_te_t, cat_features=cat_idx_t))[:, 1]
        del cb_t; gc.collect()

        lgb_tr = lgb.Dataset(X_tr_t_filled.values, y_tr_t, categorical_feature=cat_idx_t)
        lgb_va = lgb.Dataset(X_te_t_filled.values, y_te_t, categorical_feature=cat_idx_t, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
            'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
                              valid_sets=[lgb_va],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_te_t_filled.values)
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_t_filled.values, label=y_tr_t, enable_categorical=False)
        dtest = xgb.DMatrix(X_te_t_filled.values, label=y_te_t, enable_categorical=False)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 6, 'learning_rate': 0.03,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'min_child_weight': 5,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                              evals=[(dtest, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dtest)
        del xgb_model, dtrain, dtest; gc.collect()

        # Fixed reference blend weights (the tuned weights come from the
        # OOF grid search in section 10).
        blend = 0.45 * cb_pred + 0.20 * lgb_pred + 0.35 * xgb_pred
        temporal_results[seed] = {
            'cb': float(roc_auc_score(y_te_t, cb_pred)),
            'lgb': float(roc_auc_score(y_te_t, lgb_pred)),
            'xgb': float(roc_auc_score(y_te_t, xgb_pred)),
            'blend': float(roc_auc_score(y_te_t, blend))
        }
        print(f" Seed {seed}: CB={temporal_results[seed]['cb']:.4f} LGB={temporal_results[seed]['lgb']:.4f} XGB={temporal_results[seed]['xgb']:.4f} Blend={temporal_results[seed]['blend']:.4f}")

    avg_temporal = np.mean([v['blend'] for v in temporal_results.values()])
    print(f"\n AVG Temporal Blend: {avg_temporal:.4f}")
    # Hard-coded baselines from previous script versions, for regression tracking.
    print(f" Delta vs V37.3: {avg_temporal - 0.8410:+.4f}")
    print(f" Delta vs V38.2-PRO-V4: {avg_temporal - 0.8555:+.4f}")
    print(f" Delta vs V38.2-PRO-V6: {avg_temporal - 0.8543:+.4f}")

    del df_temporal, X_t; gc.collect()
else:
    # No 2024 rows present — skip temporal check but keep the variable defined
    # for the results JSON below.
    avg_temporal = 0.0
|
| 887 |
+
|
| 888 |
+
# ============================================================
# 9. STAGE 2: MULTI-SEED GROUPKFOLD
# ============================================================
# Main OOF pass: for each seed, N_FOLDS GroupKFold splits, three models per
# fold (CatBoost / LightGBM / XGBoost). OOF predictions are accumulated per
# model and averaged over seeds in section 10.
print(f"\n{'='*70}")
print(f" STAGE 2: MULTI-SEED GROUPKFOLD ({len(SEEDS)} seeds x {N_FOLDS} folds)")
print(f"{'='*70}")

all_cb_oof = []
all_lgb_oof = []
all_xgb_oof = []
all_fi = []           # one CatBoost importance vector per seed (last fold)
feature_cols_final = None  # feature list from the first fold, used for reporting

for seed_idx, seed in enumerate(SEEDS):
    print(f"\n --- Seed {seed} ({seed_idx+1}/{len(SEEDS)}) ---")
    gkf = GroupKFold(n_splits=N_FOLDS)
    cb_oof = np.zeros(len(df_base))
    lgb_oof = np.zeros(len(df_base))
    xgb_oof = np.zeros(len(df_base))

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(df_base, y, groups)):
        # Leakage-safe: fold statistics computed on training rows only.
        train_mask = pd.Series(False, index=df_base.index)
        train_mask.iloc[tr_idx] = True

        df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
            df_base, train_mask, cat_cols, selected_features=selected_set)

        if feature_cols_final is None:
            feature_cols_final = feat_cols_f
            print(f" Total features after selection: {len(feat_cols_f)}")

        X_fold = df_fold[feat_cols_f].copy()
        for c in cat_cols_f:
            if c in X_fold.columns:
                X_fold[c] = X_fold[c].astype(int)

        X_tr_df = X_fold.iloc[tr_idx]
        X_va_df = X_fold.iloc[va_idx]
        y_tr = y[tr_idx]
        y_va = y[va_idx]

        # CatBoost: native NaN
        cb = CatBoostClassifier(
            iterations=1500, depth=6, learning_rate=0.03,
            l2_leaf_reg=7, random_seed=seed, verbose=0,
            cat_features=cat_idx_f, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=10)
        pool_tr = Pool(X_tr_df, y_tr, cat_features=cat_idx_f)
        pool_va = Pool(X_va_df, y_va, cat_features=cat_idx_f)
        cb.fit(pool_tr, eval_set=pool_va, verbose=0)
        cb_pred = cb.predict_proba(Pool(X_va_df, cat_features=cat_idx_f))[:, 1]
        cb_oof[va_idx] = cb_pred

        # Keep one importance vector per seed (from the last fold) for section 11.
        if fold == N_FOLDS - 1:
            all_fi.append(cb.get_feature_importance())
        del cb, pool_tr, pool_va; gc.collect()

        # LGB/XGB: fill NaN with a sentinel since they are fed raw arrays here
        X_tr_filled = X_tr_df.fillna(-999).values
        X_va_filled = X_va_df.fillna(-999).values

        lgb_tr = lgb.Dataset(X_tr_filled, y_tr, categorical_feature=cat_idx_f)
        lgb_va_ds = lgb.Dataset(X_va_filled, y_va, categorical_feature=cat_idx_f, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
            'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
                              valid_sets=[lgb_va_ds],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_va_filled)
        lgb_oof[va_idx] = lgb_pred
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_filled, label=y_tr)
        dval = xgb.DMatrix(X_va_filled, label=y_va)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 6, 'learning_rate': 0.03,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'min_child_weight': 5,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                              evals=[(dval, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dval)
        xgb_oof[va_idx] = xgb_pred
        del xgb_model, dtrain, dval, df_fold, X_fold; gc.collect()

        if (fold + 1) % 5 == 0:
            print(f" Fold {fold+1}/{N_FOLDS} done")

    cb_auc = roc_auc_score(y, cb_oof)
    lgb_auc = roc_auc_score(y, lgb_oof)
    xgb_auc = roc_auc_score(y, xgb_oof)
    print(f" CB: {cb_auc:.4f} LGB: {lgb_auc:.4f} XGB: {xgb_auc:.4f}")

    all_cb_oof.append(cb_oof)
    all_lgb_oof.append(lgb_oof)
    all_xgb_oof.append(xgb_oof)
|
| 993 |
+
|
| 994 |
+
# ============================================================
# 10. ENSEMBLE & BLEND
# ============================================================
# Average each model's OOF predictions over seeds, then grid-search blend
# weights on the OOF AUC. NOTE(review): weights are tuned on the same OOF
# predictions they are scored on, so the "best blend" AUC is mildly optimistic.
print(f"\n{'='*70}")
print(f" ENSEMBLE RESULTS")
print(f"{'='*70}")

cb_avg = np.mean(all_cb_oof, axis=0)
lgb_avg = np.mean(all_lgb_oof, axis=0)
xgb_avg = np.mean(all_xgb_oof, axis=0)

cb_final_auc = roc_auc_score(y, cb_avg)
lgb_final_auc = roc_auc_score(y, lgb_avg)
xgb_final_auc = roc_auc_score(y, xgb_avg)

print(f" CB {len(SEEDS)}-seed avg: {cb_final_auc:.4f}")
print(f" LGB {len(SEEDS)}-seed avg: {lgb_final_auc:.4f}")
print(f" XGB {len(SEEDS)}-seed avg: {xgb_final_auc:.4f}")

# Coarse 0.05-step grid over (w_cb, w_lgb), with w_xgb as the remainder;
# each weight is floored at 0.05.
best_auc = 0
best_weights = (0.45, 0.20, 0.35)
for w_cb in np.arange(0.2, 0.7, 0.05):
    for w_lgb in np.arange(0.05, 0.5, 0.05):
        w_xgb = 1.0 - w_cb - w_lgb
        if w_xgb < 0.05: continue
        blend = w_cb * cb_avg + w_lgb * lgb_avg + w_xgb * xgb_avg
        auc = roc_auc_score(y, blend)
        if auc > best_auc:
            best_auc = auc
            best_weights = (w_cb, w_lgb, w_xgb)

print(f"\n Best 3-model blend: {best_auc:.4f}")
# Hard-coded baselines from previous script versions, for regression tracking.
print(f" Delta vs V37.3: {best_auc - 0.8697:+.4f}")
print(f" Delta vs V38.2-PRO-V4: {best_auc - 0.8758:+.4f}")
print(f" Delta vs V38.2-PRO-V6: {best_auc - 0.8760:+.4f}")
print(f" Weights: CB={best_weights[0]:.2f} LGB={best_weights[1]:.2f} XGB={best_weights[2]:.2f}")

# Rank-average blend as a scale-free sanity check on the weighted blend.
rank_blend = (rankdata(cb_avg) + rankdata(lgb_avg) + rankdata(xgb_avg)) / 3
rank_auc = roc_auc_score(y, rank_blend)
print(f" Rank blend: {rank_auc:.4f}")

final_blend_prob = best_weights[0] * cb_avg + best_weights[1] * lgb_avg + best_weights[2] * xgb_avg
final_auc = roc_auc_score(y, final_blend_prob)
# Clip away exact 0/1 so log_loss is finite.
final_brier = brier_score_loss(y, np.clip(final_blend_prob, 1e-7, 1-1e-7))
final_logloss = log_loss(y, np.clip(final_blend_prob, 1e-7, 1-1e-7))

print(f"\n FINAL METRICS:")
print(f" AUC: {final_auc:.4f} (V38.2-PRO-V4: 0.8758, V38.2-PRO-V6: 0.8760)")
print(f" Brier: {final_brier:.4f}")
print(f" LogLoss: {final_logloss:.4f}")
|
| 1044 |
+
|
| 1045 |
+
# ============================================================
# 11. FEATURE IMPORTANCE
# ============================================================
# Report CatBoost importances averaged over the per-seed vectors collected in
# Stage 2. NOTE(review): feature_cols_final comes from the first fold while
# each importance vector comes from the last fold of its seed — this assumes
# identical feature lists across folds; confirm.
print(f"\n{'='*70}")
print(f" FEATURE IMPORTANCE (avg across seeds)")
print(f"{'='*70}")

if feature_cols_final and all_fi:
    avg_fi = np.mean(all_fi, axis=0)
    fi_pairs = sorted(zip(feature_cols_final, avg_fi), key=lambda x: -x[1])

    print(f" {'Rank':<5s} {'Feature':<50s} {'Importance':>10s}")
    print(f" {'-'*5} {'-'*50} {'-'*10}")
    for i, (fname, imp) in enumerate(fi_pairs[:50]):
        # Markers tag the feature family for quick scanning of the table.
        marker = ""
        if '_resid' in fname: marker = " [RESID]"
        elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [INTERACT]"
        elif '_school_pctile' in fname: marker = " [PCTILE]"
        elif fname.startswith('school_base_rate'): marker = " [SCHOOL_RATE]"
        elif 'ed_boost' in fname or 'ed2_boost' in fname: marker = " [ED_BOOST]"
        elif fname.startswith('has_'): marker = " [FLAG]"
        elif 'ps2_' in fname: marker = " [PS2_V7]"
        print(f" {i+1:<5d} {fname:<50s} {imp:>10.2f}{marker}")

    # Count PS-related features in top 30
    ps_in_top30 = sum(1 for f, _ in fi_pairs[:30] if 'ps2_' in f or 'ps_bert' in f or 'ps_mean' in f)
    print(f"\n PS-related features in top 30: {ps_in_top30}")
|
| 1072 |
+
|
| 1073 |
+
# ============================================================
# 12. SAVE RESULTS
# ============================================================
# Persist a JSON summary (metrics, config, change log) and the OOF
# predictions CSV; filenames carry an ablation suffix when ABLATE_PS_BERT
# is on so runs don't overwrite each other.
elapsed = time.time() - start_time

results = {
    'version': 'V38.2-pro-v7',
    'ablation': {'ABLATE_PS_BERT': ABLATE_PS_BERT},
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'elapsed_minutes': elapsed / 60,
    # Human-readable change log for this version.
    'changes': [
        'FIX #6: has_ps=0 -> ALL ps2_* scores NaN (was 5057 polluted rows)',
        'FIX #7: Residualization school_mean for PS features uses ONLY has_ps=1 rows',
        'NEW: ps2_mean_school_pctile (continuous within-school percentile)',
        'REMOVE: ps2_is_cliche_topic (53.5% prevalence, no signal)',
        f'ABLATION: ABLATE_PS_BERT={ABLATE_PS_BERT}',
        'All V6 fixes carried forward',
    ],
    # Fixed baselines from earlier versions for side-by-side comparison.
    'comparison': {
        'v37_3': {'auc': 0.8697, 'temporal_auc': 0.8410},
        'v38_2_pro_v4': {'auc': 0.8758, 'temporal_auc': 0.8555},
        'v38_2_pro_v6': {'auc': 0.8760, 'temporal_auc': 0.8543},
    },
    'temporal_validation': {
        'per_seed': temporal_results,
        'avg_blend': float(avg_temporal),
    },
    'groupkfold': {
        'best_3model_blend': float(best_auc),
        'best_weights': [float(w) for w in best_weights],
        'rank_blend': float(rank_auc),
    },
    'final_metrics': {
        'auc': float(final_auc),
        'brier': float(final_brier),
        'logloss': float(final_logloss),
    },
    'n_features': len(feature_cols_final) if feature_cols_final else 0,
    'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
}

suffix = '_ablate_ps_bert' if ABLATE_PS_BERT else ''
with open(os.path.join(OUTPUT_DIR, f'v38_2_pro_v7{suffix}_results.json'), 'w') as f:
    json.dump(results, f, indent=2)

# OOF predictions per application row, one column per model plus the blend.
oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
oof_df['cb_pred'] = cb_avg
oof_df['lgb_pred'] = lgb_avg
oof_df['xgb_pred'] = xgb_avg
oof_df['final_pred'] = final_blend_prob
oof_df.to_csv(os.path.join(OUTPUT_DIR, f'v38_2_pro_v7{suffix}_oof_predictions.csv'), index=False)

print(f"\n{'='*70}")
print(f" V38.2-PRO-V7 COMPLETE (ABLATE_PS_BERT={ABLATE_PS_BERT})")
print(f" Total time: {elapsed/60:.1f} minutes")
print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'}")
print(f" GroupKFold AUC: {final_auc:.4f} (V38.2-PRO-V4: 0.8758, V38.2-PRO-V6: 0.8760)")
print(f" Temporal AUC: {avg_temporal:.4f} (V38.2-PRO-V4: 0.8555, V38.2-PRO-V6: 0.8543)")
print(f"{'='*70}")
|