catninja123 commited on
Commit
0cf69e6
·
verified ·
1 Parent(s): ea3ac07

V7: PS data quality fix + school pctile + ablation

Browse files
Files changed (1) hide show
  1. train_v38_2_pro_v7.py +1131 -0
train_v38_2_pro_v7.py ADDED
@@ -0,0 +1,1131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ====================================================================
3
+ V38.2-PRO-V7 MODEL - PS Data Quality Fix + School Pctile + Ablation
4
+ ====================================================================
5
+ Changes from V38.2-PRO-V6:
6
+ 1. FIX #6: has_ps=0 -> ALL ps2_* scores NaN (5057 rows were polluted)
7
+ 2. FIX #7: Residualization school_mean for PS features uses ONLY has_ps=1 rows
8
+ 3. NEW: ps2_mean_school_pctile (continuous within-school percentile, solves granularity)
9
+ 4. REMOVE: ps2_is_cliche_topic (53.5% prevalence, no signal)
10
+ 5. ABLATION: ABLATE_PS_BERT flag to test removing ps_bert_pca 16 dims
11
+ 6. All V6 fixes carried forward
12
+ ====================================================================
13
+ """
14
+ import pandas as pd
15
+ import numpy as np
16
+ import json, os, warnings, sys, time, pickle, gc
17
+ warnings.filterwarnings('ignore')
18
+ from sklearn.model_selection import GroupKFold
19
+ from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss
20
+ from sklearn.preprocessing import LabelEncoder
21
+ from scipy.stats import rankdata
22
+
23
+ try:
24
+ from catboost import CatBoostClassifier, Pool
25
+ import lightgbm as lgb
26
+ import xgboost as xgb
27
+ print("All model libraries loaded successfully")
28
+ except ImportError as e:
29
+ print(f"Missing library: {e}")
30
+ import subprocess
31
+ subprocess.check_call([sys.executable, '-m', 'pip', 'install',
32
+ 'catboost', 'lightgbm', 'xgboost', '-q'])
33
+ from catboost import CatBoostClassifier, Pool
34
+ import lightgbm as lgb
35
+ import xgboost as xgb
36
+
37
# Paths are resolved relative to this script so it runs from any CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data')      # feature matrices + LLM-score JSONs
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')  # artifacts written here
os.makedirs(OUTPUT_DIR, exist_ok=True)

TARGET = 'target'                    # binary label column name
SEEDS = [42, 123, 456, 789, 2024]    # seeds for multi-seed averaging
N_FOLDS = 10                         # CV folds (grouped by student_id)
FEATURE_SELECT_TOP_N = 150           # features kept after stage-1 importance ranking
start_time = time.time()             # wall-clock start, for runtime reporting

# ============================================================
# ABLATION FLAGS - set to True to remove feature groups
# ============================================================
ABLATE_PS_BERT = False  # Set True to remove ps_bert_pca_0..15 (16 dims)
52
+
53
def safe_num(v, default=np.nan):
    """Coerce *v* to float, mapping the legacy -1 missing-data sentinel to NaN.

    Parameters
    ----------
    v : int, float, bool, str, or anything else
        Raw value, typically pulled out of an LLM-score JSON dict.
        Booleans take the int/float fast path (True -> 1.0), matching
        the original behavior relied on for boolean score dimensions.
    default : float
        Returned for unparseable strings and non-numeric types
        (default NaN).

    Returns
    -------
    float
        float(v); NaN when the value equals the -1 sentinel; otherwise
        *default* when v cannot be converted.
    """
    if isinstance(v, (int, float)):
        val = float(v)
        return np.nan if val == -1 else val
    if isinstance(v, str):
        try:
            val = float(v)
        except ValueError:
            # float(str) can only raise ValueError; the original bare
            # `except:` also swallowed KeyboardInterrupt/SystemExit.
            return default
        return np.nan if val == -1 else val
    return default
65
+
66
# ============================================================
# 1. LOAD DATA (v8 feature matrix)
# ============================================================
print("=" * 70)
print(" V38.2-PRO-V7: PS DATA QUALITY FIX + SCHOOL PCTILE + ABLATION")
print("=" * 70)
print(f" ABLATE_PS_BERT = {ABLATE_PS_BERT}")

# Try v8 first, fall back to v6, then v5
# (v5 is the last-resort path and is read unconditionally if the
# newer matrices are absent — a missing v5 file raises here by design.)
v8_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v8.csv')
v6_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v6.csv')
v5_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v5.csv')
if os.path.exists(v8_path):
    df_raw = pd.read_csv(v8_path)
    print(f"V8 features loaded: {df_raw.shape}")
elif os.path.exists(v6_path):
    df_raw = pd.read_csv(v6_path)
    print(f"V6 features loaded (v8 not found): {df_raw.shape}")
else:
    df_raw = pd.read_csv(v5_path)
    print(f"V5 features loaded: {df_raw.shape}")

# Load LLM features. Missing files degrade gracefully to an empty dict
# so downstream feature building simply produces NaN columns.
llm_features_loaded = {}
for fname, varname in [
    ('llm_activity_scores.json', 'act_scores'),
    ('llm_supp_quality_all.json', 'supp_scores'),
    ('llm_major_difficulty.json', 'major_diff'),
    ('ps_yale_scores.json', 'ps_yale'),
]:
    fpath = os.path.join(DATA_DIR, fname)
    if os.path.exists(fpath):
        with open(fpath) as f:
            llm_features_loaded[varname] = json.load(f)
        print(f" Loaded {fname}: {len(llm_features_loaded[varname])} entries")
    else:
        llm_features_loaded[varname] = {}

# Load raw data to get ED2 round info.
# round_lookup maps "<student_id>_<school name>" -> round string
# (e.g. "Early Decision II") parsed out of the free-text summary column.
import re
RAW_CSV = os.path.join(DATA_DIR, 'students_with_essays_merged_clean.csv')
round_lookup = {}
if os.path.exists(RAW_CSV):
    print(f"\n Loading raw CSV for ED2 round info...")
    try:
        # Chunked read keeps memory bounded; dtype=str avoids pandas
        # coercing ids to floats (which would break the key format).
        raw_chunks = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'],
                                 dtype=str, chunksize=500)
        for chunk in raw_chunks:
            for _, row in chunk.iterrows():
                # '.0' strip normalizes float-formatted ids ("123.0" -> "123")
                sid = str(row.get('student_id', '')).replace('.0', '')
                summary = str(row.get('school_results_summary', ''))
                # Split on numbered entries like "1. ", "2. " (lookahead keeps the number)
                entries = re.split(r'(?=\d+\.)', summary)
                for entry in entries:
                    # Longer alternatives listed first so "Early Decision II"
                    # wins over "Early Decision", etc.
                    m = re.search(r'(Early Decision II|Early Decision|Early Action II|Early Action|Restrictive Early Action|Regular Decision)', entry)
                    if m:
                        round_type = m.group(1)
                        # School name = text between "N." and the first " - " / "("
                        school_m = re.search(r'\d+\.\s*(.+?)(?:\s*[-–]\s*|\s*\()', entry)
                        if school_m:
                            school_name = school_m.group(1).strip()
                            key = f"{sid}_{school_name}"
                            round_lookup[key] = round_type
        print(f" Round lookup built: {len(round_lookup)} entries")
    except Exception as e:
        # Best-effort: the model can still run without round refinement.
        print(f" Warning: Could not load raw CSV: {e}")
130
+
131
# ============================================================
# 2. DATA CLEANING & QUALITY FIXES
# ============================================================
print(f"\n{'='*70}")
print(f" DATA QUALITY FIXES")
print(f"{'='*70}")

# 2a. Filter years (2018/2019 cohorts excluded from training)
df = df_raw[~df_raw['year'].isin([2018, 2019])].copy()
df = df.reset_index(drop=True)
print(f"After filtering 2018-2019: {df.shape}")

# 2b. FIX #1: SAT=0 -> NaN + has_sat
# 0 encodes "not submitted"; keep an explicit missingness flag before nulling.
sat_zero = (df['sat'] == 0).sum()
df['has_sat'] = (df['sat'] > 0).astype(int)
df.loc[df['sat'] == 0, 'sat'] = np.nan
print(f"\n FIX #1: SAT=0 -> NaN: {sat_zero} rows ({sat_zero/len(df)*100:.1f}%)")
print(f" has_sat=1: {df['has_sat'].sum()}, has_sat=0: {(df['has_sat']==0).sum()}")

# 2c. FIX #2: TOEFL=0 -> NaN + has_toefl (same 0-as-missing convention)
toefl_zero = (df['toefl'] == 0).sum()
df['has_toefl'] = (df['toefl'] > 0).astype(int)
df.loc[df['toefl'] == 0, 'toefl'] = np.nan
print(f" FIX #2: TOEFL=0 -> NaN: {toefl_zero} rows ({toefl_zero/len(df)*100:.1f}%)")

# 2d. FIX #3: GPA=0 -> NaN (v5 already has has_gpa)
gpa_zero = (df['gpa'] == 0).sum()
df.loc[df['gpa'] == 0, 'gpa'] = np.nan
print(f" FIX #3: GPA=0 -> NaN: {gpa_zero} rows ({gpa_zero/len(df)*100:.1f}%)")
if 'has_gpa' not in df.columns:
    df['has_gpa'] = df['gpa'].notna().astype(int)
print(f" has_gpa=1: {(df['has_gpa']==1).sum()}, has_gpa=0: {(df['has_gpa']==0).sum()}")

# 2e. FIX #4: -1 -> NaN for sentinel columns
# These columns use -1 (not 0) as their missing-data sentinel.
sentinel_cols = ['taste_yearly_admits_log']
for col in ['hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_overall_hist_rate']:
    if col in df.columns:
        sentinel_cols.append(col)

for col in sentinel_cols:
    if col in df.columns:
        n_neg1 = (df[col] == -1).sum()
        df.loc[df[col] == -1, col] = np.nan
        print(f" FIX #4: {col}: -1 -> NaN: {n_neg1} rows ({n_neg1/len(df)*100:.1f}%)")

# 2f. FIX #5: has_ps=0 -> ps_bert all NaN
# Rows without a personal statement must not carry essay embeddings.
ps_bert_cols = [c for c in df.columns if c.startswith('ps_bert_pca_')]
no_ps_mask = df['has_ps'] == 0
if ps_bert_cols:
    n_fix = no_ps_mask.sum()
    for col in ps_bert_cols:
        df.loc[no_ps_mask, col] = np.nan
    print(f" FIX #5: ps_bert -> NaN for has_ps=0: {n_fix} rows, {len(ps_bert_cols)} columns")
else:
    print(f" FIX #5: No ps_bert_pca columns found")
186
+
187
# 2f-v7. FIX #6 (NEW): has_ps=0 -> ALL ps2_* scores NaN
# Previously ps2 scores were broadcast to has_ps=0 rows (5057 polluted rows!)
ps2_score_cols = [c for c in df.columns if c.startswith('ps2_') and c != 'ps2_essay_type']
# BUGFIX: compute the affected-row count here instead of reusing FIX #5's
# `n_fix`, which is only assigned inside `if ps_bert_cols:` and would raise
# NameError whenever the feature matrix has no ps_bert_pca_* columns.
n_fix = int(no_ps_mask.sum())
# Guard: ps2_mean may be absent on older (v5/v6) fallback feature matrices.
if 'ps2_mean' in df.columns:
    n_ps2_polluted = (no_ps_mask & df['ps2_mean'].notna()).sum()
else:
    n_ps2_polluted = 0
for col in ps2_score_cols:
    df.loc[no_ps_mask, col] = np.nan
print(f" FIX #6 (V7 NEW): ps2_* -> NaN for has_ps=0: {n_fix} rows, {len(ps2_score_cols)} cols")
print(f" Previously polluted ps2 rows: {n_ps2_polluted}")
195
+
196
# 2f-v7b. REMOVE ps2_is_cliche_topic (53.5% prevalence, no signal)
if 'ps2_is_cliche_topic' in df.columns:
    df.drop(columns=['ps2_is_cliche_topic'], inplace=True)
    print(f" FIX #6b (V7 NEW): Removed ps2_is_cliche_topic (53.5% prevalence, no signal)")

# 2f-v7c. ABLATION: Remove ps_bert_pca if flag is set
# (ps_bert_cols was collected in FIX #5 above.)
if ABLATE_PS_BERT and ps_bert_cols:
    df.drop(columns=ps_bert_cols, inplace=True)
    print(f" ABLATION: Removed {len(ps_bert_cols)} ps_bert_pca columns")

# Also set ps_word_count to NaN for has_ps=0 (it's already 0, but be explicit)
df.loc[no_ps_mask, 'ps_word_count'] = np.nan

# 2g. FIX portfolio_size: log transform + cap (from V2)
# log1p(clip) tames the heavy right tail; the raw value is preserved
# for binning and dropped from the model features later.
print(f"\n Portfolio size transform:")
print(f" Before: mean={df['portfolio_size'].mean():.1f}, max={df['portfolio_size'].max():.0f}")
df['portfolio_size_raw'] = df['portfolio_size'].copy()
df['portfolio_size'] = np.log1p(df['portfolio_size'].clip(upper=20))
print(f" After log(clip(x,20)): mean={df['portfolio_size'].mean():.2f}, max={df['portfolio_size'].max():.2f}")
df['portfolio_size_bin'] = pd.cut(df['portfolio_size_raw'],
                                  bins=[0, 5, 10, 15, 20, 100],
                                  labels=[0, 1, 2, 3, 4]).astype(int)
218
+
219
+ # 2h. ED2 split (from V2)
220
def get_detailed_round(row):
    """Return the detailed application round for a row: ED1/ED2/EA/REA/RD.

    First consults round_lookup (parsed from the raw results summary)
    keyed by "<student_id>_<school>"; if no raw match is found, falls
    back to the row's coarse round_cat, mapping legacy 'ED' to 'ED1'.
    """
    student = str(row.get('student_id', '')).replace('.0', '')
    lookup_key = f"{student}_{str(row.get('school', ''))}"
    raw_round = round_lookup.get(lookup_key, '')

    # Ordered by specificity: 'Early Decision II' must be matched before
    # 'Early Decision' (substring containment), and 'Restrictive Early
    # Action' before 'Early Action'. 'Early Action' also covers the
    # 'Early Action II' label, which maps to plain EA.
    round_labels = (
        ('Early Decision II', 'ED2'),
        ('Early Decision', 'ED1'),
        ('Restrictive Early Action', 'REA'),
        ('Early Action', 'EA'),
        ('Regular Decision', 'RD'),
    )
    for needle, label in round_labels:
        if needle in raw_round:
            return label

    # No raw-summary match: use the coarse category from the feature matrix.
    fallback = str(row.get('round_cat', 'RD'))
    return 'ED1' if fallback == 'ED' else fallback
239
+
240
# Apply the refined round mapping row-by-row and derive binary round flags.
df['round_cat_v2'] = df.apply(get_detailed_round, axis=1)
print(f"\n Round distribution (v2):")
print(df['round_cat_v2'].value_counts().to_string())

df['is_ed1'] = (df['round_cat_v2'] == 'ED1').astype(int)
df['is_ed2'] = (df['round_cat_v2'] == 'ED2').astype(int)
df['is_rea'] = (df['round_cat_v2'] == 'REA').astype(int)
# "early" = any non-RD round
df['is_early'] = df['round_cat_v2'].isin(['ED1', 'ED2', 'EA', 'REA']).astype(int)
# Overwrite the coarse column so all downstream features use the refined rounds.
df['round_cat'] = df['round_cat_v2']
249
+
250
# ============================================================
# 3. PARSE LLM FEATURES
# ============================================================
# Each JSON may be either a list of per-record dicts or a dict keyed by id;
# both shapes are normalized into plain lookup dicts below.

# Activity scores, keyed by student id. List entries are indexed both by
# the raw id string and by any numeric fragment of it ('.0'-stripped),
# so multiple key spellings resolve to the same record.
act_scores = {}
raw = llm_features_loaded.get('act_scores', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict) and item.get('success', False):
            sid_raw = str(item.get('student_id', ''))
            act_scores[sid_raw] = item
            parts = sid_raw.split('_')
            for p in parts:
                clean = p.replace('.0', '')
                if clean.isdigit():
                    act_scores[clean] = item
elif isinstance(raw, dict):
    for sid, scores in raw.items():
        if isinstance(scores, dict):
            act_scores[sid] = scores

# Supplemental-essay scores, keyed by "<student_id>_<school>".
# Entries with overall_quality <= 1 are dropped as degenerate scores.
supp_scores = {}
raw = llm_features_loaded.get('supp_scores', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict) and item.get('success', False):
            sid = str(item.get('student_id', '')).replace('.0', '')
            school = str(item.get('school', ''))
            key = f"{sid}_{school}"
            oq = item.get('overall_quality', 0)
            if isinstance(oq, (int, float)) and oq <= 1:
                continue
            supp_scores[key] = item
elif isinstance(raw, dict):
    for key, scores in raw.items():
        if isinstance(scores, dict):
            oq = scores.get('overall_quality', 0)
            if isinstance(oq, (int, float)) and oq <= 1:
                continue
            supp_scores[key] = scores
print(f" Supp scores after filtering score=1: {len(supp_scores)} valid entries")

# Major difficulty must be a dict keyed by "<school>_<major_cat>";
# a list shape is unusable here and is discarded.
major_diff = llm_features_loaded.get('major_diff', {})
if isinstance(major_diff, list):
    major_diff = {}

# Personal-statement (Yale rubric) scores, keyed by student id.
ps_yale = {}
raw = llm_features_loaded.get('ps_yale', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict):
            sid = str(item.get('student_id', '')).replace('.0', '')
            ps_yale[sid] = item
elif isinstance(raw, dict):
    ps_yale = raw

print(f"\nLLM features: Activity={len(act_scores)}, Supp={len(supp_scores)}, MajorDiff={len(major_diff)}, PS={len(ps_yale)}")
306
+
307
# Score dimensions extracted per record from each LLM-feature dict.
ACT_DIMS = ['max_power_index', 'avg_power_index', 'n_high_power',
            'n_founder', 'n_president', 'max_scope',
            'has_publication', 'has_patent', 'has_summer_program',
            'summer_program_tier', 'has_olympiad', 'olympiad_level',
            'activity_coherence', 'spike_strength']

SUPP_DIMS = ['overall_quality', 'specificity_score', 'enthusiasm_score',
             'has_imagination_scene', 'mentions_specific_course',
             'mentions_specific_professor', 'mentions_specific_program',
             'mentions_specific_facility', 'coherence_with_major', 'has_red_flag']

# PS dimensions are discovered from the first ps_yale record, excluding
# metadata keys and is_* flags; falls back to the known rubric names
# when ps_yale is empty.
sample_ps = next(iter(ps_yale.values()), {}) if ps_yale else {}
PS_DIMS = [k for k in sample_ps.keys() if k not in ['student_id', 'success', 'error', 'note', 'essay_type']
           and not k.startswith('is_')]
if not PS_DIMS:
    PS_DIMS = ['show_not_tell', 'reflection_depth', 'authentic_voice',
               'coherence_focus', 'overall_effectiveness']
324
+
325
# ============================================================
# 4. DEFINE FEATURE GROUPS
# ============================================================
# Per-student numeric columns eligible for school-level residualization.
STUDENT_LEVEL_NUMERIC = [
    'toefl', 'sat', 'gpa',
    'act_total_count', 'act_type_diversity',
    *[f'act_slot_pca_{i}' for i in range(20)],
    *[f'act_bert_pca_{i}' for i in range(16)],
    'honors_max_score', 'honors_avg_score', 'honors_min_score',
    'honors_count', 'honors_total_score',
    'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
    'honors_has_national',
    'honors_quality_ratio',
    'cuilu_hs_top10_rate', 'cuilu_hs_top20_rate',
    'cuilu_hs_top10_count', 'cuilu_hs_top20_count',
    'cuilu_hs_total',
    'cuilu_feeder_rank', 'cuilu_hs_type_rate', 'cuilu_region_rate',
    'hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_to_univ_hist_admits',
    'hs_overall_hist_rate',
    'summer_max_geili', 'summer_has_elite', 'summer_count',
    'summer_program_count', 'summer_difficulty_max',
    # PS V2 scores (ps2_is_cliche_topic REMOVED in V7)
    'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice', 'ps2_overall', 'ps2_mean',
    'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
    'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
]

# Conditionally include ps_bert_pca (for ablation)
if not ABLATE_PS_BERT:
    STUDENT_LEVEL_NUMERIC.extend([f'ps_bert_pca_{i}' for i in range(16)])

# Identify PS-related features for special school_mean handling
# (their residualization must use only has_ps=1 rows; see FIX #7).
PS_RELATED_FEATURES = set([
    *[f'ps_bert_pca_{i}' for i in range(16)],
    'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice',
    'ps2_overall', 'ps2_mean',
    'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
    'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
    'ps_word_count',
])

# Add act_type_count columns dynamically
act_type_cols_in_data = [c for c in df.columns if c.startswith('act_type_count_')]
STUDENT_LEVEL_NUMERIC.extend(act_type_cols_in_data)

# Filter to only existing columns
STUDENT_LEVEL_NUMERIC = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
print(f"\n Student-level numeric features: {len(STUDENT_LEVEL_NUMERIC)}")

# Features that also get explicit interactions with school_base_rate.
KEY_STUDENT_FEATURES = [
    'toefl', 'sat', 'gpa',
    'honors_max_score', 'honors_avg_score', 'honors_count',
    'honors_quality_ratio',
    'act_type_diversity', 'act_total_count',
    'hs_to_univ_hist_rate_smoothed',
    'summer_max_geili',
    # PS V2 scores
    'ps2_overall', 'ps2_character_revelation', 'ps2_craft_voice',
]

# LLM aggregate columns that get school_base_rate interactions.
LLM_INTERACTION_FEATURES = [
    'llm_act_mean', 'llm_act_max', 'llm_act_avg_power_index',
    'supp_mean', 'supp_max', 'ps_mean',
    'major_difficulty',
    # PS V2 scores
    'ps2_mean', 'ps2_overall',
]
392
+
393
+ # ============================================================
394
+ # 5. BUILD FEATURES
395
+ # ============================================================
396
def build_features_base(df):
    """Build base features WITHOUT residualization.

    Maps the LLM-score lookup dicts (act_scores, supp_scores, major_diff,
    ps_yale — module globals) onto the frame, builds aggregate and simple
    interaction columns, and label-encodes the categorical columns.

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned feature matrix; not mutated (a copy is taken).

    Returns
    -------
    (pd.DataFrame, list[str])
        The augmented frame and the list of (now integer-encoded)
        categorical column names.
    """
    df = df.copy()

    df['is_partial_year'] = (df['year'] == 2025).astype(int)
    df['year_cat'] = df['year'].astype(str)
    # '.0'-stripped string id; must match the key convention of the
    # LLM lookup dicts built in section 3.
    df['sid_str'] = df['student_id'].astype(str).str.replace('.0', '', regex=False)

    # LLM Activity features (one column per ACT_DIMS dimension).
    # d=dim default-arg binding avoids the late-binding-closure pitfall.
    for dim in ACT_DIMS:
        col_name = f'llm_act_{dim}'
        df[col_name] = df['sid_str'].map(
            lambda s, d=dim: safe_num(act_scores.get(s, {}).get(d, np.nan)))

    # LLM Supp features, keyed per (student, school) pair.
    def get_supp_score(row, dim):
        key = f"{row['sid_str']}_{row['school']}"
        return safe_num(supp_scores.get(key, {}).get(dim, np.nan))
    for dim in SUPP_DIMS:
        col_name = f'supp_{dim}'
        df[col_name] = df.apply(lambda r, d=dim: get_supp_score(r, d), axis=1)

    # Major difficulty, keyed per (school, major) pair.
    def get_major_diff(row):
        key = f"{row['school']}_{row['major_cat']}"
        return safe_num(major_diff.get(key, {}).get('difficulty_score', np.nan))
    df['major_difficulty'] = df.apply(get_major_diff, axis=1)

    # PS Yale scores, keyed per student.
    for dim in PS_DIMS:
        col_name = f'ps_{dim}'
        df[col_name] = df['sid_str'].map(
            lambda s, d=dim: safe_num(ps_yale.get(s, {}).get(d, np.nan)))

    # Aggregates (row-wise mean/max skip NaN by pandas default).
    llm_act_cols = [f'llm_act_{d}' for d in ACT_DIMS]
    valid_act = df[llm_act_cols]
    df['llm_act_mean'] = valid_act.mean(axis=1)
    df['llm_act_max'] = valid_act.max(axis=1)
    df['llm_act_n_valid'] = valid_act.notna().sum(axis=1)

    # has_red_flag is a boolean flag, excluded from the quality average.
    supp_num_cols = [f'supp_{d}' for d in SUPP_DIMS if d not in ['has_red_flag']]
    valid_supp = df[supp_num_cols]
    df['supp_mean'] = valid_supp.mean(axis=1)
    df['supp_max'] = valid_supp.max(axis=1)

    ps_cols = [f'ps_{d}' for d in PS_DIMS]
    valid_ps = df[ps_cols]
    df['ps_mean'] = valid_ps.mean(axis=1)

    # Basic interactions (scaled by rough max products to keep magnitudes small).
    df['toefl_x_sat'] = df['toefl'] * df['sat'] / 10000.0
    df['gpa_x_toefl'] = df['gpa'] * df['toefl'] / 100.0
    df['llm_act_x_supp'] = df['llm_act_mean'] * df['supp_mean']

    if 'honors_avg_score' in df.columns:
        df['honors_x_sat'] = df['honors_avg_score'] * df['sat'] / 1600
        df['honors_x_toefl'] = df['honors_avg_score'] * df['toefl'] / 120

    if 'cuilu_hs_top10_rate' in df.columns and 'taste_score_sensitivity' in df.columns:
        df['cuilu_x_taste'] = df['cuilu_hs_top10_rate'] * df['taste_score_sensitivity']

    # Categoricals: base columns plus composite school_* crosses,
    # all label-encoded to ints for the tree libraries.
    cat_cols = ['school', 'round_cat', 'major_cat', 'hs_cat', 'year_cat', 'hs_name', 'province']
    cat_cols = [c for c in cat_cols if c in df.columns]

    if 'round_cat' in df.columns:
        df['school_round'] = df['school'].astype(str) + '_' + df['round_cat'].astype(str)
        cat_cols.append('school_round')
    df['school_major'] = df['school'].astype(str) + '_' + df['major_cat'].astype(str)
    cat_cols.append('school_major')
    if 'hs_cat' in df.columns:
        df['school_hstype'] = df['school'].astype(str) + '_' + df['hs_cat'].astype(str)
        cat_cols.append('school_hstype')

    for c in cat_cols:
        df[c] = df[c].fillna('_MISSING_').astype(str)
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c]).astype(int)

    return df, cat_cols
477
+
478
+
479
def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
    """Add residualized + interaction + ED boost features using ONLY training data statistics.
    V7 KEY FIX: For PS-related features, school_mean uses ONLY has_ps=1 rows.

    All per-school statistics (base rates, boosts, means, percentile
    distributions) are computed exclusively from rows where train_mask is
    True, so no target leakage reaches validation rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame produced by build_features_base; not mutated (copied).
    train_mask : boolean indexer
        Selects the training rows whose statistics may be used.
    cat_cols : list[str]
        Categorical column names (always kept in the feature list).
    selected_features : set/list or None
        If given, restrict the returned feature list to these plus a
        must-keep core set.

    Returns
    -------
    (df, feature_cols, cat_cols, cat_indices)
    """
    df = df.copy()

    # Step 1: Bayesian-smoothed school_base_rate
    train_df = df[train_mask]
    global_rate = train_df[TARGET].mean()

    school_stats = train_df.groupby('school').agg(
        school_raw_rate=(TARGET, 'mean'),
        school_n_apps=(TARGET, 'count'),
        school_n_admits=(TARGET, 'sum'),
    ).reset_index()

    # Shrink each school's raw rate toward the global rate with a prior
    # weight of SMOOTH_STRENGTH pseudo-applications.
    SMOOTH_STRENGTH = 30
    school_stats['school_base_rate'] = (
        (school_stats['school_raw_rate'] * school_stats['school_n_apps'] + global_rate * SMOOTH_STRENGTH) /
        (school_stats['school_n_apps'] + SMOOTH_STRENGTH)
    )

    df = df.merge(school_stats[['school', 'school_base_rate', 'school_n_apps', 'school_n_admits']],
                  on='school', how='left')
    # Schools unseen in training fall back to the global rate / zero counts.
    df['school_base_rate'] = df['school_base_rate'].fillna(global_rate)
    df['school_n_apps'] = df['school_n_apps'].fillna(0)
    df['school_n_admits'] = df['school_n_admits'].fillna(0)

    # Step 1b: ED boost per school = ED-round admit rate minus RD admit rate,
    # computed only for schools observed in both rounds.
    ed1_mask = train_df['is_ed1'] == 1
    rd_mask = train_df['is_early'] == 0

    ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
    rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()

    ed_boost_map = {}
    for school in ed1_school_rates.index:
        if school in rd_school_rates.index:
            ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
    df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)

    ed2_mask = train_df['is_ed2'] == 1
    ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
    ed2_boost_map = {}
    for school in ed2_school_rates.index:
        if school in rd_school_rates.index:
            ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
    df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)

    # Step 2: Residualize student features
    # V7 KEY FIX: For PS-related features, compute school_mean using ONLY has_ps=1 training rows
    student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]

    # Pre-compute the has_ps=1 training subset for PS features
    train_has_ps = train_df[train_df['has_ps'] == 1]

    resid_cols = []
    for col in student_feat_available:
        resid_col = f'{col}_resid'

        # V7 FIX #7: Use has_ps=1 subset for PS-related features
        # (otherwise NaN-heavy has_ps=0 rows would bias the school mean).
        if col in PS_RELATED_FEATURES:
            school_mean_series = train_has_ps.groupby('school')[col].mean()
        else:
            school_mean_series = train_df.groupby('school')[col].mean()

        col_school_mean = df['school'].map(school_mean_series)
        df[resid_col] = df[col] - col_school_mean
        resid_cols.append(resid_col)

    # Step 2b (V7 NEW): ps2_mean_school_pctile - continuous within-school percentile
    # This solves the granularity problem: ps2_mean has only 17 unique values,
    # but within each school the percentile is continuous
    pctile_ps_cols = []
    if 'ps2_mean' in df.columns:
        ps_pctile_col = 'ps2_mean_school_pctile'
        # Use ONLY has_ps=1 training rows for school distributions
        school_ps_distributions = {}
        for school_id in train_has_ps['school'].unique():
            vals = train_has_ps[train_has_ps['school'] == school_id]['ps2_mean'].dropna().values
            # Require >2 observations for a meaningful empirical CDF.
            if len(vals) > 2:
                school_ps_distributions[school_id] = vals

        # sd=... default-arg binding snapshots the dict (no late binding).
        def compute_ps_pctile(row, sd=school_ps_distributions):
            school_id = row['school']
            val = row['ps2_mean']
            if pd.isna(val) or school_id not in sd:
                return np.nan
            dist = sd[school_id]
            # Empirical CDF: fraction of training values <= this value.
            return np.mean(dist <= val)

        df[ps_pctile_col] = df.apply(compute_ps_pctile, axis=1)
        pctile_ps_cols.append(ps_pctile_col)

        n_valid = df[ps_pctile_col].notna().sum()
        n_unique = df[ps_pctile_col].nunique()
        print(f" V7 NEW: {ps_pctile_col}: {n_valid} valid, {n_unique} unique values")

    # Step 3: Explicit interactions (student feature x school_base_rate)
    interaction_cols = []
    for col in KEY_STUDENT_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

            # Also interact the residualized version when it was built above.
            resid_col = f'{col}_resid'
            if resid_col in df.columns:
                int_resid_col = f'{col}_resid_x_rate'
                df[int_resid_col] = df[resid_col] * df['school_base_rate']
                interaction_cols.append(int_resid_col)

    # Step 3b: LLM feature x school_base_rate interactions
    for col in LLM_INTERACTION_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

    # Step 3c: portfolio_size x school_base_rate interaction
    if 'portfolio_size' in df.columns:
        df['portfolio_x_school_rate'] = df['portfolio_size'] * df['school_base_rate']
        interaction_cols.append('portfolio_x_school_rate')

    # Step 3d: ED flag x school_ed_boost interaction
    if 'is_ed1' in df.columns:
        df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
        interaction_cols.append('ed1_x_ed_boost')
    if 'is_ed2' in df.columns:
        df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
        interaction_cols.append('ed2_x_ed2_boost')

    # Step 3e: has_sat/has_toefl/has_gpa interactions with school_base_rate
    for flag in ['has_sat', 'has_toefl', 'has_gpa']:
        if flag in df.columns:
            int_col = f'{flag}_x_school_rate'
            df[int_col] = df[flag] * df['school_base_rate']
            interaction_cols.append(int_col)

    # Step 3f (V7 NEW): ps2_mean_school_pctile x school_base_rate
    if 'ps2_mean_school_pctile' in df.columns:
        df['ps2_pctile_x_school_rate'] = df['ps2_mean_school_pctile'] * df['school_base_rate']
        interaction_cols.append('ps2_pctile_x_school_rate')

    # Step 4: Student percentile within school (NaN-safe)
    # Same empirical-CDF approach as Step 2b but over all training rows.
    pctile_cols = []
    for col in ['toefl', 'sat', 'gpa', 'honors_max_score',
                'llm_act_mean', 'supp_mean']:
        if col not in df.columns:
            continue
        pctile_col = f'{col}_school_pctile'
        school_distributions = {}
        for school_id in train_df['school'].unique():
            vals = train_df[train_df['school'] == school_id][col].dropna().values
            if len(vals) > 2:
                school_distributions[school_id] = vals

        # col=col / sd=... default args bind per-iteration values.
        def compute_pctile(row, col=col, sd=school_distributions):
            school_id = row['school']
            val = row[col]
            if pd.isna(val) or school_id not in sd:
                return np.nan
            dist = sd[school_id]
            return np.mean(dist <= val)

        df[pctile_col] = df.apply(compute_pctile, axis=1)
        pctile_cols.append(pctile_col)

    # Merge ps2 pctile into pctile_cols for reporting
    pctile_cols.extend(pctile_ps_cols)

    # Step 5: Student competitiveness score (NaN-safe)
    # NOTE(review): `weights` is collected but not applied — the strength
    # score is an unweighted mean of the scaled components.
    if all(c in df.columns for c in ['toefl', 'sat', 'honors_max_score']):
        components = []
        weights = []
        for col, w, scale in [('toefl', 0.3, 120), ('sat', 0.3, 1600),
                              ('honors_max_score', 0.2, 10), ('llm_act_mean', 0.2, 10)]:
            if col in df.columns:
                components.append(df[col] / scale)
                weights.append(w)
        if components:
            strength_df = pd.DataFrame(components).T
            df['student_strength'] = strength_df.mean(axis=1)
            df['strength_vs_school'] = df['student_strength'] - (1 - df['school_base_rate'])

    # Build final feature list: every numeric column except identifiers.
    num_cols = [c for c in df.columns if df[c].dtype in ['float64', 'int64', 'float32', 'int32']
                and c not in [TARGET, 'student_id', 'year', 'Unnamed: 0']]

    all_feat = list(set(num_cols + cat_cols))
    feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
    for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size_raw']:
        if remove in feature_cols:
            feature_cols.remove(remove)

    # Remove constant columns
    to_drop = [c for c in feature_cols if df[c].nunique() <= 1]
    feature_cols = [c for c in feature_cols if c not in to_drop]

    # Apply feature selection if provided
    if selected_features is not None:
        # Core columns are always retained regardless of stage-1 ranking.
        must_keep = set(cat_cols) | {'school_base_rate', 'school_n_apps', 'school_n_admits',
                                     'student_strength', 'strength_vs_school',
                                     'school_ed_boost', 'school_ed2_boost',
                                     'is_ed1', 'is_ed2', 'is_rea', 'is_early',
                                     'ed1_x_ed_boost', 'ed2_x_ed2_boost',
                                     'has_sat', 'has_toefl', 'has_gpa',
                                     'portfolio_size', 'portfolio_size_bin', 'portfolio_x_school_rate',
                                     # V7: always keep new PS features
                                     'ps2_mean_school_pctile', 'ps2_pctile_x_school_rate'}
        feature_cols = [c for c in feature_cols if c in selected_features or c in must_keep]

    # Handle inf (division artifacts) so the boosters see NaN instead.
    for c in feature_cols:
        if df[c].dtype in ['float64', 'float32']:
            df[c] = df[c].replace([np.inf, -np.inf], np.nan)

    cat_indices = [feature_cols.index(c) for c in cat_cols if c in feature_cols]

    new_feat_count = len(resid_cols) + len(interaction_cols) + len(pctile_cols) + 5
    print(f" Resid features: {len(resid_cols)} resid + {len(interaction_cols)} interact + {len(pctile_cols)} pctile = {new_feat_count} new, total={len(feature_cols)}")

    return df, feature_cols, cat_cols, cat_indices
701
+
702
+
703
+ # ============================================================
704
+ # 6. BUILD BASE FEATURES
705
+ # ============================================================
706
# Build the base (fold-independent) feature table once; fold-specific
# residualized features are added later by add_residualized_features().
df_base, cat_cols = build_features_base(df)
print(f"\nBase features built. Shape: {df_base.shape}")

# Quick NaN summary for the key raw-score columns.  NaN here is expected:
# missing tests / missing PS essays are encoded as NaN rather than sentinels.
print(f"\n NaN summary after fixes:")
for col in ['sat', 'toefl', 'gpa', 'ps2_mean', 'ps2_overall']:
    if col in df_base.columns:
        nan_pct = df_base[col].isna().mean() * 100
        print(f" {col}: {nan_pct:.1f}% NaN")

# V7: Verify ps2 cleanup — rows without a personal statement (has_ps == 0)
# must have NaN in ps2_* columns after FIX #6.
# Robustness fix: guard on 'has_ps' existing as well as 'ps2_mean', so this
# sanity check cannot raise KeyError on a base-feature build that lacks the
# flag column (previously only 'ps2_mean' was guarded).
if 'has_ps' in df_base.columns and 'ps2_mean' in df_base.columns:
    no_ps_check = df_base[df_base['has_ps'] == 0]
    ps2_polluted = no_ps_check['ps2_mean'].notna().sum()
    print(f"\n V7 VERIFY: ps2_mean non-NaN for has_ps=0: {ps2_polluted} (should be 0)")

# Target vector and grouping key.  GroupKFold groups by student_id so the
# same student never appears in both the train and validation folds.
y = df_base[TARGET].values
groups = df_base['student_id'].values
724
+
725
+ # ============================================================
726
+ # 7. STAGE 1: FEATURE IMPORTANCE ESTIMATION
727
+ # ============================================================
728
print(f"\n{'='*70}")
print(f" STAGE 1: FEATURE IMPORTANCE ESTIMATION")
print(f"{'='*70}")

# Stage 1: quick 5-fold CatBoost pass whose only purpose is to rank features;
# the top FEATURE_SELECT_TOP_N (plus all categoricals) are kept for Stage 2.
stage1_fi = []
gkf_s1 = GroupKFold(n_splits=5)
for fold, (tr_idx, va_idx) in enumerate(gkf_s1.split(df_base, y, groups)):
    # Residualized features must be fit on training rows only, so that
    # validation-fold statistics (school means, percentiles) do not leak.
    train_mask = pd.Series(False, index=df_base.index)
    train_mask.iloc[tr_idx] = True

    df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
        df_base, train_mask, cat_cols)

    # FIX: take explicit copies.  The original chained selection
    # (df_fold[feat_cols_f].iloc[...]) yields derived frames, and the astype
    # assignments below then trigger pandas SettingWithCopyWarning.
    X_tr = df_fold[feat_cols_f].iloc[tr_idx].copy()
    X_va = df_fold[feat_cols_f].iloc[va_idx].copy()
    y_tr = y[tr_idx]
    y_va = y[va_idx]

    # CatBoost requires integer (or string) dtype for categorical columns.
    for c in cat_cols_f:
        if c in X_tr.columns:
            X_tr[c] = X_tr[c].astype(int)
            X_va[c] = X_va[c].astype(int)

    cb = CatBoostClassifier(
        iterations=500, depth=6, learning_rate=0.05,
        l2_leaf_reg=7, random_seed=42, verbose=0,
        cat_features=cat_idx_f, eval_metric='AUC',
        early_stopping_rounds=50)
    pool_tr = Pool(X_tr, y_tr, cat_features=cat_idx_f)
    pool_va = Pool(X_va, y_va, cat_features=cat_idx_f)
    cb.fit(pool_tr, eval_set=pool_va, verbose=0)

    fi = cb.get_feature_importance()
    stage1_fi.append(fi)

    auc = roc_auc_score(y_va, cb.predict_proba(Pool(X_va, cat_features=cat_idx_f))[:, 1])
    print(f" Fold {fold+1}/5: AUC={auc:.4f}, Features={len(feat_cols_f)}")

    if fold == 0:
        # NOTE(review): assumes every fold produces the same feature list in
        # the same order.  Constant-column dropping inside
        # add_residualized_features is fold-dependent, so np.mean below could
        # misalign if folds disagree — TODO confirm.
        all_feature_names = feat_cols_f

    del cb, pool_tr, pool_va, df_fold; gc.collect()

# Average importances across folds and keep the top-N non-categorical
# features; categorical columns are always retained.
avg_fi = np.mean(stage1_fi, axis=0)
fi_pairs = sorted(zip(all_feature_names, avg_fi), key=lambda x: -x[1])

selected_set = set(cat_cols)
n_added = 0
for fname, imp in fi_pairs:
    if fname not in cat_cols:
        selected_set.add(fname)
        n_added += 1
        if n_added >= FEATURE_SELECT_TOP_N:
            break

print(f"\n Feature selection: {len(all_feature_names)} -> {len(selected_set)} features")
print(f" Top 30 features:")
for i, (fname, imp) in enumerate(fi_pairs[:30]):
    # Tag each feature with its family for the report; first match wins.
    marker = ""
    if '_resid' in fname: marker = " [R]"
    elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [I]"
    elif '_school_pctile' in fname: marker = " [P]"
    elif 'school_base_rate' in fname: marker = " [S]"
    elif 'ed_boost' in fname: marker = " [ED]"
    elif 'ps2_' in fname: marker = " [PS2]"
    print(f" {i+1:3d}. {fname:<50s} {imp:>8.2f}{marker}")
795
+
796
+ # ============================================================
797
+ # 8. TEMPORAL VALIDATION WITH SELECTED FEATURES
798
+ # ============================================================
799
print(f"\n{'='*70}")
print(f" TEMPORAL VALIDATION (2020-2023 -> 2024) WITH FEATURE SELECTION")
print(f"{'='*70}")

# Hold out the most recent cohort (2024) to estimate forward-in-time
# generalization; all models train on 2020-2023 only.
mask_train_temporal = df_base['year'].isin([2020, 2021, 2022, 2023])
mask_test_temporal = df_base['year'] == 2024

temporal_results = {}
if mask_test_temporal.sum() > 0:
    # Residualized features are fit on the temporal-train rows only,
    # restricted to the Stage-1 selected feature set.
    df_temporal, feat_cols_t, cat_cols_t, cat_idx_t = add_residualized_features(
        df_base, mask_train_temporal, cat_cols, selected_features=selected_set)

    X_t = df_temporal[feat_cols_t].copy()
    for c in cat_cols_t:
        if c in X_t.columns:
            X_t[c] = X_t[c].astype(int)

    # Boolean-Series masks index the numpy target array y positionally here;
    # df_base and y are row-aligned by construction.
    X_tr_t = X_t[mask_train_temporal]
    X_te_t = X_t[mask_test_temporal]
    y_tr_t = y[mask_train_temporal]
    y_te_t = y[mask_test_temporal]

    # LightGBM/XGBoost get a sentinel fill; CatBoost handles NaN natively.
    X_tr_t_filled = X_tr_t.fillna(-999)
    X_te_t_filled = X_te_t.fillna(-999)

    print(f" Train: {len(X_tr_t)}, Test: {len(X_te_t)}, Features: {len(feat_cols_t)}")

    for seed in SEEDS:
        # NOTE(review): all three models early-stop against the 2024 test
        # set itself, which makes the temporal AUCs somewhat optimistic —
        # consistent with how the prior-version baselines were measured.
        cb_t = CatBoostClassifier(
            iterations=1000, depth=6, learning_rate=0.03,
            l2_leaf_reg=7, random_seed=seed, verbose=0,
            cat_features=cat_idx_t, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=10)
        pool_tr = Pool(X_tr_t, y_tr_t, cat_features=cat_idx_t)
        pool_te = Pool(X_te_t, y_te_t, cat_features=cat_idx_t)
        cb_t.fit(pool_tr, eval_set=pool_te, verbose=0)
        cb_pred = cb_t.predict_proba(Pool(X_te_t, cat_features=cat_idx_t))[:, 1]
        del cb_t; gc.collect()

        lgb_tr = lgb.Dataset(X_tr_t_filled.values, y_tr_t, categorical_feature=cat_idx_t)
        lgb_va = lgb.Dataset(X_te_t_filled.values, y_te_t, categorical_feature=cat_idx_t, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
            'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
                              valid_sets=[lgb_va],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_te_t_filled.values)
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_t_filled.values, label=y_tr_t, enable_categorical=False)
        dtest = xgb.DMatrix(X_te_t_filled.values, label=y_te_t, enable_categorical=False)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 6, 'learning_rate': 0.03,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'min_child_weight': 5,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                              evals=[(dtest, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        # NOTE(review): native Booster.predict uses all trained trees by
        # default, not best_iteration — presumably intended for parity with
        # earlier versions; confirm, or pass iteration_range explicitly.
        xgb_pred = xgb_model.predict(dtest)
        del xgb_model, dtrain, dtest; gc.collect()

        # Fixed blend weights for reporting; the OOF-optimized weights are
        # searched later in the ensemble section.
        blend = 0.45 * cb_pred + 0.20 * lgb_pred + 0.35 * xgb_pred
        temporal_results[seed] = {
            'cb': float(roc_auc_score(y_te_t, cb_pred)),
            'lgb': float(roc_auc_score(y_te_t, lgb_pred)),
            'xgb': float(roc_auc_score(y_te_t, xgb_pred)),
            'blend': float(roc_auc_score(y_te_t, blend))
        }
        print(f" Seed {seed}: CB={temporal_results[seed]['cb']:.4f} LGB={temporal_results[seed]['lgb']:.4f} XGB={temporal_results[seed]['xgb']:.4f} Blend={temporal_results[seed]['blend']:.4f}")

    # Deltas are against hard-coded temporal AUCs of previous versions.
    avg_temporal = np.mean([v['blend'] for v in temporal_results.values()])
    print(f"\n AVG Temporal Blend: {avg_temporal:.4f}")
    print(f" Delta vs V37.3: {avg_temporal - 0.8410:+.4f}")
    print(f" Delta vs V38.2-PRO-V4: {avg_temporal - 0.8555:+.4f}")
    print(f" Delta vs V38.2-PRO-V6: {avg_temporal - 0.8543:+.4f}")

    del df_temporal, X_t; gc.collect()
else:
    # No 2024 rows available: skip temporal validation entirely.
    avg_temporal = 0.0
887
+
888
+ # ============================================================
889
+ # 9. STAGE 2: MULTI-SEED GROUPKFOLD
890
+ # ============================================================
891
print(f"\n{'='*70}")
print(f" STAGE 2: MULTI-SEED GROUPKFOLD ({len(SEEDS)} seeds x {N_FOLDS} folds)")
print(f"{'='*70}")

# Stage 2: full out-of-fold predictions for each of the three model
# families, repeated over SEEDS and averaged later in the ensemble section.
all_cb_oof = []
all_lgb_oof = []
all_xgb_oof = []
all_fi = []
feature_cols_final = None

for seed_idx, seed in enumerate(SEEDS):
    print(f"\n --- Seed {seed} ({seed_idx+1}/{len(SEEDS)}) ---")
    gkf = GroupKFold(n_splits=N_FOLDS)
    cb_oof = np.zeros(len(df_base))
    lgb_oof = np.zeros(len(df_base))
    xgb_oof = np.zeros(len(df_base))

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(df_base, y, groups)):
        # Fit residualized features on training rows only (no leakage),
        # restricted to the Stage-1 selected feature set.
        train_mask = pd.Series(False, index=df_base.index)
        train_mask.iloc[tr_idx] = True

        df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
            df_base, train_mask, cat_cols, selected_features=selected_set)

        if feature_cols_final is None:
            # NOTE(review): captured from the first fold only — assumes all
            # folds produce the same feature list/order; TODO confirm.
            feature_cols_final = feat_cols_f
            print(f" Total features after selection: {len(feat_cols_f)}")

        X_fold = df_fold[feat_cols_f].copy()
        for c in cat_cols_f:
            if c in X_fold.columns:
                X_fold[c] = X_fold[c].astype(int)

        X_tr_df = X_fold.iloc[tr_idx]
        X_va_df = X_fold.iloc[va_idx]
        y_tr = y[tr_idx]
        y_va = y[va_idx]

        # CatBoost: native NaN
        cb = CatBoostClassifier(
            iterations=1500, depth=6, learning_rate=0.03,
            l2_leaf_reg=7, random_seed=seed, verbose=0,
            cat_features=cat_idx_f, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=10)
        pool_tr = Pool(X_tr_df, y_tr, cat_features=cat_idx_f)
        pool_va = Pool(X_va_df, y_va, cat_features=cat_idx_f)
        cb.fit(pool_tr, eval_set=pool_va, verbose=0)
        cb_pred = cb.predict_proba(Pool(X_va_df, cat_features=cat_idx_f))[:, 1]
        cb_oof[va_idx] = cb_pred

        # Record CatBoost importances only on the last fold of each seed;
        # the final report averages these per-seed snapshots.
        if fold == N_FOLDS - 1:
            all_fi.append(cb.get_feature_importance())
        del cb, pool_tr, pool_va; gc.collect()

        # LGB/XGB: fill NaN
        X_tr_filled = X_tr_df.fillna(-999).values
        X_va_filled = X_va_df.fillna(-999).values

        lgb_tr = lgb.Dataset(X_tr_filled, y_tr, categorical_feature=cat_idx_f)
        lgb_va_ds = lgb.Dataset(X_va_filled, y_va, categorical_feature=cat_idx_f, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
            'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
                              valid_sets=[lgb_va_ds],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_va_filled)
        lgb_oof[va_idx] = lgb_pred
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_filled, label=y_tr)
        dval = xgb.DMatrix(X_va_filled, label=y_va)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 6, 'learning_rate': 0.03,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'min_child_weight': 5,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                              evals=[(dval, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        # NOTE(review): Booster.predict defaults to all trees, not
        # best_iteration — presumably intended for parity; confirm.
        xgb_pred = xgb_model.predict(dval)
        xgb_oof[va_idx] = xgb_pred
        del xgb_model, dtrain, dval, df_fold, X_fold; gc.collect()

        if (fold + 1) % 5 == 0:
            print(f" Fold {fold+1}/{N_FOLDS} done")

    # Per-seed OOF AUC for each model family.
    cb_auc = roc_auc_score(y, cb_oof)
    lgb_auc = roc_auc_score(y, lgb_oof)
    xgb_auc = roc_auc_score(y, xgb_oof)
    print(f" CB: {cb_auc:.4f} LGB: {lgb_auc:.4f} XGB: {xgb_auc:.4f}")

    all_cb_oof.append(cb_oof)
    all_lgb_oof.append(lgb_oof)
    all_xgb_oof.append(xgb_oof)
993
+
994
+ # ============================================================
995
+ # 10. ENSEMBLE & BLEND
996
+ # ============================================================
997
print(f"\n{'='*70}")
print(f" ENSEMBLE RESULTS")
print(f"{'='*70}")

# Average the out-of-fold predictions over seeds, one array per model family.
cb_avg = np.mean(all_cb_oof, axis=0)
lgb_avg = np.mean(all_lgb_oof, axis=0)
xgb_avg = np.mean(all_xgb_oof, axis=0)

# Seed-averaged OOF AUC of each family on its own.
cb_final_auc = roc_auc_score(y, cb_avg)
lgb_final_auc = roc_auc_score(y, lgb_avg)
xgb_final_auc = roc_auc_score(y, xgb_avg)

print(f" CB {len(SEEDS)}-seed avg: {cb_final_auc:.4f}")
print(f" LGB {len(SEEDS)}-seed avg: {lgb_final_auc:.4f}")
print(f" XGB {len(SEEDS)}-seed avg: {xgb_final_auc:.4f}")

# Grid-search convex blend weights in 0.05 steps.  Only a strictly better
# AUC replaces the incumbent, so ties keep the earliest candidate found.
best_auc = 0
best_weights = (0.45, 0.20, 0.35)
for cand_cb in np.arange(0.2, 0.7, 0.05):
    for cand_lgb in np.arange(0.05, 0.5, 0.05):
        cand_xgb = 1.0 - cand_cb - cand_lgb
        if cand_xgb < 0.05:
            continue
        mix = cand_cb * cb_avg + cand_lgb * lgb_avg + cand_xgb * xgb_avg
        mix_auc = roc_auc_score(y, mix)
        if mix_auc > best_auc:
            best_auc, best_weights = mix_auc, (cand_cb, cand_lgb, cand_xgb)

print(f"\n Best 3-model blend: {best_auc:.4f}")
print(f" Delta vs V37.3: {best_auc - 0.8697:+.4f}")
print(f" Delta vs V38.2-PRO-V4: {best_auc - 0.8758:+.4f}")
print(f" Delta vs V38.2-PRO-V6: {best_auc - 0.8760:+.4f}")
print(f" Weights: CB={best_weights[0]:.2f} LGB={best_weights[1]:.2f} XGB={best_weights[2]:.2f}")

# Rank-average blend as a scale-free reference point.
rank_blend = (rankdata(cb_avg) + rankdata(lgb_avg) + rankdata(xgb_avg)) / 3
rank_auc = roc_auc_score(y, rank_blend)
print(f" Rank blend: {rank_auc:.4f}")

# Final blended probabilities with the searched weights; probabilities are
# clipped away from 0/1 once for both Brier and log-loss.
final_blend_prob = best_weights[0] * cb_avg + best_weights[1] * lgb_avg + best_weights[2] * xgb_avg
clipped_prob = np.clip(final_blend_prob, 1e-7, 1 - 1e-7)
final_auc = roc_auc_score(y, final_blend_prob)
final_brier = brier_score_loss(y, clipped_prob)
final_logloss = log_loss(y, clipped_prob)

print(f"\n FINAL METRICS:")
print(f" AUC: {final_auc:.4f} (V38.2-PRO-V4: 0.8758, V38.2-PRO-V6: 0.8760)")
print(f" Brier: {final_brier:.4f}")
print(f" LogLoss: {final_logloss:.4f}")
1044
+
1045
+ # ============================================================
1046
+ # 11. FEATURE IMPORTANCE
1047
+ # ============================================================
1048
print(f"\n{'='*70}")
print(f" FEATURE IMPORTANCE (avg across seeds)")
print(f"{'='*70}")

if feature_cols_final and all_fi:
    # Average the per-seed CatBoost importances and sort descending.
    mean_importance = np.mean(all_fi, axis=0)
    fi_pairs = sorted(zip(feature_cols_final, mean_importance), key=lambda x: -x[1])

    print(f" {'Rank':<5s} {'Feature':<50s} {'Importance':>10s}")
    print(f" {'-'*5} {'-'*50} {'-'*10}")

    def _tag(name):
        # Map a feature name to its family tag; the first match wins.
        if '_resid' in name:
            return " [RESID]"
        if '_x_school_rate' in name or '_resid_x_rate' in name or '_x_ed' in name:
            return " [INTERACT]"
        if '_school_pctile' in name:
            return " [PCTILE]"
        if name.startswith('school_base_rate'):
            return " [SCHOOL_RATE]"
        if 'ed_boost' in name or 'ed2_boost' in name:
            return " [ED_BOOST]"
        if name.startswith('has_'):
            return " [FLAG]"
        if 'ps2_' in name:
            return " [PS2_V7]"
        return ""

    for rank, (fname, imp) in enumerate(fi_pairs[:50], start=1):
        print(f" {rank:<5d} {fname:<50s} {imp:>10.2f}{_tag(fname)}")

    # Count PS-related features in top 30
    ps_in_top30 = 0
    for feat, _ in fi_pairs[:30]:
        if 'ps2_' in feat or 'ps_bert' in feat or 'ps_mean' in feat:
            ps_in_top30 += 1
    print(f"\n PS-related features in top 30: {ps_in_top30}")
1072
+
1073
+ # ============================================================
1074
+ # 12. SAVE RESULTS
1075
+ # ============================================================
1076
elapsed = time.time() - start_time

# Results manifest persisted to JSON for cross-version comparison.
# Note: temporal_results is keyed by integer seeds — json.dump converts
# those keys to strings in the written file.
results = {
    'version': 'V38.2-pro-v7',
    'ablation': {'ABLATE_PS_BERT': ABLATE_PS_BERT},
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'elapsed_minutes': elapsed / 60,
    'changes': [
        'FIX #6: has_ps=0 -> ALL ps2_* scores NaN (was 5057 polluted rows)',
        'FIX #7: Residualization school_mean for PS features uses ONLY has_ps=1 rows',
        'NEW: ps2_mean_school_pctile (continuous within-school percentile)',
        'REMOVE: ps2_is_cliche_topic (53.5% prevalence, no signal)',
        f'ABLATION: ABLATE_PS_BERT={ABLATE_PS_BERT}',
        'All V6 fixes carried forward',
    ],
    # Hard-coded reference metrics of earlier versions, used for the deltas.
    'comparison': {
        'v37_3': {'auc': 0.8697, 'temporal_auc': 0.8410},
        'v38_2_pro_v4': {'auc': 0.8758, 'temporal_auc': 0.8555},
        'v38_2_pro_v6': {'auc': 0.8760, 'temporal_auc': 0.8543},
    },
    'temporal_validation': {
        'per_seed': temporal_results,
        'avg_blend': float(avg_temporal),
    },
    'groupkfold': {
        'best_3model_blend': float(best_auc),
        'best_weights': [float(w) for w in best_weights],
        'rank_blend': float(rank_auc),
    },
    'final_metrics': {
        'auc': float(final_auc),
        'brier': float(final_brier),
        'logloss': float(final_logloss),
    },
    'n_features': len(feature_cols_final) if feature_cols_final else 0,
    # fi_pairs holds whichever importance ranking was computed last (the
    # seed-averaged report when available, else the Stage-1 ranking).
    'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
}

# Ablation runs get a distinct filename suffix so they never overwrite the
# main run's artifacts.
suffix = '_ablate_ps_bert' if ABLATE_PS_BERT else ''
with open(os.path.join(OUTPUT_DIR, f'v38_2_pro_v7{suffix}_results.json'), 'w') as f:
    json.dump(results, f, indent=2)

# Persist seed-averaged OOF predictions per model plus the final blend,
# keyed by student/school/year, for downstream analysis and stacking.
oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
oof_df['cb_pred'] = cb_avg
oof_df['lgb_pred'] = lgb_avg
oof_df['xgb_pred'] = xgb_avg
oof_df['final_pred'] = final_blend_prob
oof_df.to_csv(os.path.join(OUTPUT_DIR, f'v38_2_pro_v7{suffix}_oof_predictions.csv'), index=False)

print(f"\n{'='*70}")
print(f" V38.2-PRO-V7 COMPLETE (ABLATE_PS_BERT={ABLATE_PS_BERT})")
print(f" Total time: {elapsed/60:.1f} minutes")
print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'}")
print(f" GroupKFold AUC: {final_auc:.4f} (V38.2-PRO-V4: 0.8758, V38.2-PRO-V6: 0.8760)")
print(f" Temporal AUC: {avg_temporal:.4f} (V38.2-PRO-V4: 0.8555, V38.2-PRO-V6: 0.8543)")
print(f"{'='*70}")
+ print(f"{'='*70}")