catninja123 committed on
Commit
2005ced
·
verified ·
1 Parent(s): 4875f0d

Upload train_v38_2_pro_v5.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_v38_2_pro_v5.py +1076 -0
train_v38_2_pro_v5.py ADDED
@@ -0,0 +1,1076 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ====================================================================
3
+ V38.2-PRO-V5 MODEL - Admitted Similarity + PS Grok Scores + v7 Matrix
4
+ ====================================================================
5
+ Changes from V38.2-PRO-V4:
6
+ 1. Use v7 feature matrix (v6 + admitted similarity + PS red flags)
7
+ 2. New: act_bert_admit_sim, ps_bert_admit_sim, act_slot_admit_sim (time-respecting)
8
+ 3. New: act_bert_sim_contrast, ps_bert_sim_contrast (admit - reject similarity)
9
+ 4. New: Grok 4.2 PS scores (originality, reflection, authenticity, overall)
10
+ 5. New: PS red flags (is_templated, has_grammar_issues, etc.)
11
+ 6. Added admitted_sim features to KEY_STUDENT_FEATURES for residualization
12
+ 7. All V4 fixes carried forward
13
+ ====================================================================
14
+ """
15
+ import pandas as pd
16
+ import numpy as np
17
+ import json, os, warnings, sys, time, pickle, gc
18
+ warnings.filterwarnings('ignore')
19
+ from sklearn.model_selection import GroupKFold
20
+ from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss
21
+ from sklearn.preprocessing import LabelEncoder
22
+ from scipy.stats import rankdata
23
+
24
+ try:
25
+ from catboost import CatBoostClassifier, Pool
26
+ import lightgbm as lgb
27
+ import xgboost as xgb
28
+ print("All model libraries loaded successfully")
29
+ except ImportError as e:
30
+ print(f"Missing library: {e}")
31
+ import subprocess
32
+ subprocess.check_call([sys.executable, '-m', 'pip', 'install',
33
+ 'catboost', 'lightgbm', 'xgboost', '-q'])
34
+ from catboost import CatBoostClassifier, Pool
35
+ import lightgbm as lgb
36
+ import xgboost as xgb
37
+
38
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
39
+ DATA_DIR = os.path.join(BASE_DIR, 'data')
40
+ OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
41
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
42
+
43
+ TARGET = 'target'
44
+ SEEDS = [42, 123, 456, 789, 2024]
45
+ N_FOLDS = 10
46
+ FEATURE_SELECT_TOP_N = 150
47
+ start_time = time.time()
48
+
49
def safe_num(v, default=np.nan):
    """Coerce *v* to float, mapping the -1 sentinel to NaN.

    Upstream feature files encode "missing" as -1; converting that to NaN
    lets the gradient-boosting models handle missingness natively instead
    of treating -1 as a real value.

    Args:
        v: Raw value — a number, a numeric string, or anything else.
        default: Returned when *v* is not parseable (defaults to NaN).

    Returns:
        float(v); NaN when the parsed value equals -1; *default* when *v*
        is neither numeric nor a parseable numeric string.
    """
    if isinstance(v, (int, float)):
        val = float(v)
        return np.nan if val == -1 else val
    if isinstance(v, str):
        try:
            val = float(v)
        except ValueError:  # was a bare `except:` — narrow to the real failure mode
            return default
        return np.nan if val == -1 else val
    return default
61
+
62
+ # ============================================================
63
+ # 1. LOAD DATA (v6 feature matrix)
64
+ # ============================================================
65
+ print("=" * 70)
66
+ print(" V38.2-PRO-V5: ADMITTED SIMILARITY + PS GROK SCORES + V7 MATRIX")
67
+ print("=" * 70)
68
+
69
+ # Try v6 first, fall back to v5, then v4
70
+ v7_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v7.csv')
71
+ v6_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v6.csv')
72
+ v5_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v5.csv')
73
+ v4_path = os.path.join(DATA_DIR, 'v38_2_integrated_features.csv')
74
+ if os.path.exists(v7_path):
75
+ df_raw = pd.read_csv(v7_path)
76
+ print(f"V7 features loaded: {df_raw.shape}")
77
+ elif os.path.exists(v6_path):
78
+ df_raw = pd.read_csv(v6_path)
79
+ print(f"V6 features loaded (v7 not found): {df_raw.shape}")
80
+ elif os.path.exists(v5_path):
81
+ df_raw = pd.read_csv(v5_path)
82
+ print(f"V5 features loaded: {df_raw.shape}")
83
+ else:
84
+ df_raw = pd.read_csv(v4_path)
85
+ print(f"V4 features loaded: {df_raw.shape}")
86
+
87
+ # Load LLM features
88
+ llm_features_loaded = {}
89
+ for fname, varname in [
90
+ ('llm_activity_scores.json', 'act_scores'),
91
+ ('llm_supp_quality_all.json', 'supp_scores'),
92
+ ('llm_major_difficulty.json', 'major_diff'),
93
+ ('ps_yale_scores.json', 'ps_yale'),
94
+ ('grok42_ps_scores.json', 'grok_ps_scores'),
95
+ ]:
96
+ fpath = os.path.join(DATA_DIR, fname)
97
+ if os.path.exists(fpath):
98
+ with open(fpath) as f:
99
+ llm_features_loaded[varname] = json.load(f)
100
+ print(f" Loaded {fname}: {len(llm_features_loaded[varname])} entries")
101
+ else:
102
+ llm_features_loaded[varname] = {}
103
+
104
+ # Load raw data to get ED2 round info
105
+ import re
106
+ RAW_CSV = os.path.join(DATA_DIR, 'students_with_essays_merged_clean.csv')
107
+ round_lookup = {}
108
+ if os.path.exists(RAW_CSV):
109
+ print(f"\n Loading raw CSV for ED2 round info...")
110
+ try:
111
+ raw_chunks = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'],
112
+ dtype=str, chunksize=500)
113
+ for chunk in raw_chunks:
114
+ for _, row in chunk.iterrows():
115
+ sid = str(row.get('student_id', '')).replace('.0', '')
116
+ summary = str(row.get('school_results_summary', ''))
117
+ entries = re.split(r'(?=\d+\.)', summary)
118
+ for entry in entries:
119
+ m = re.search(r'(Early Decision II|Early Decision|Early Action II|Early Action|Restrictive Early Action|Regular Decision)', entry)
120
+ if m:
121
+ round_type = m.group(1)
122
+ school_m = re.search(r'\d+\.\s*(.+?)(?:\s*[-–]\s*|\s*\()', entry)
123
+ if school_m:
124
+ school_name = school_m.group(1).strip()
125
+ key = f"{sid}_{school_name}"
126
+ round_lookup[key] = round_type
127
+ print(f" Round lookup built: {len(round_lookup)} entries")
128
+ except Exception as e:
129
+ print(f" Warning: Could not load raw CSV: {e}")
130
+
131
+ # ============================================================
132
+ # 2. DATA CLEANING & QUALITY FIXES
133
+ # ============================================================
134
+ print(f"\n{'='*70}")
135
+ print(f" DATA QUALITY FIXES")
136
+ print(f"{'='*70}")
137
+
138
+ # 2a. Filter years
139
+ df = df_raw[~df_raw['year'].isin([2018, 2019])].copy()
140
+ df = df.reset_index(drop=True)
141
+ print(f"After filtering 2018-2019: {df.shape}")
142
+
143
+ # 2b. FIX #1: SAT=0 -> NaN + has_sat
144
+ sat_zero = (df['sat'] == 0).sum()
145
+ df['has_sat'] = (df['sat'] > 0).astype(int)
146
+ df.loc[df['sat'] == 0, 'sat'] = np.nan
147
+ print(f"\n FIX #1: SAT=0 -> NaN: {sat_zero} rows ({sat_zero/len(df)*100:.1f}%)")
148
+ print(f" has_sat=1: {df['has_sat'].sum()}, has_sat=0: {(df['has_sat']==0).sum()}")
149
+
150
+ # 2c. FIX #2: TOEFL=0 -> NaN + has_toefl
151
+ toefl_zero = (df['toefl'] == 0).sum()
152
+ df['has_toefl'] = (df['toefl'] > 0).astype(int)
153
+ df.loc[df['toefl'] == 0, 'toefl'] = np.nan
154
+ print(f" FIX #2: TOEFL=0 -> NaN: {toefl_zero} rows ({toefl_zero/len(df)*100:.1f}%)")
155
+
156
+ # 2d. FIX #3: GPA=0 -> NaN (v5 already has has_gpa)
157
+ gpa_zero = (df['gpa'] == 0).sum()
158
+ df.loc[df['gpa'] == 0, 'gpa'] = np.nan
159
+ print(f" FIX #3: GPA=0 -> NaN: {gpa_zero} rows ({gpa_zero/len(df)*100:.1f}%)")
160
+ if 'has_gpa' not in df.columns:
161
+ df['has_gpa'] = df['gpa'].notna().astype(int)
162
+ print(f" has_gpa=1: {(df['has_gpa']==1).sum()}, has_gpa=0: {(df['has_gpa']==0).sum()}")
163
+
164
+ # 2e. FIX #4: -1 -> NaN for sentinel columns
165
+ sentinel_cols = ['taste_yearly_admits_log']
166
+ # v5 removed hs_to_univ_hist_rate, hs_to_univ_hist_rate_smoothed, hs_overall_hist_rate
167
+ # but check if they exist
168
+ for col in ['hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_overall_hist_rate']:
169
+ if col in df.columns:
170
+ sentinel_cols.append(col)
171
+
172
+ for col in sentinel_cols:
173
+ if col in df.columns:
174
+ n_neg1 = (df[col] == -1).sum()
175
+ df.loc[df[col] == -1, col] = np.nan
176
+ print(f" FIX #4: {col}: -1 -> NaN: {n_neg1} rows ({n_neg1/len(df)*100:.1f}%)")
177
+
178
+ # 2f. FIX #5: has_ps=0 -> ps_bert all NaN
179
+ act_bert_cols = [c for c in df.columns if c.startswith('act_bert_pca_')]
180
+ # ps_bert_pca columns were removed in v5, but check
181
+ ps_bert_cols = [c for c in df.columns if c.startswith('ps_bert_pca_')]
182
+ if ps_bert_cols:
183
+ no_ps_mask = df['has_ps'] == 0
184
+ n_fix = no_ps_mask.sum()
185
+ for col in ps_bert_cols:
186
+ df.loc[no_ps_mask, col] = np.nan
187
+ print(f" FIX #5: ps_bert -> NaN for has_ps=0: {n_fix} rows, {len(ps_bert_cols)} columns")
188
+ else:
189
+ print(f" FIX #5: No ps_bert_pca columns in v5 (already removed)")
190
+
191
+ # 2g. FIX portfolio_size: log transform + cap (from V2)
192
+ print(f"\n Portfolio size transform:")
193
+ print(f" Before: mean={df['portfolio_size'].mean():.1f}, max={df['portfolio_size'].max():.0f}")
194
+ df['portfolio_size_raw'] = df['portfolio_size'].copy()
195
+ df['portfolio_size'] = np.log1p(df['portfolio_size'].clip(upper=20))
196
+ print(f" After log(clip(x,20)): mean={df['portfolio_size'].mean():.2f}, max={df['portfolio_size'].max():.2f}")
197
+ df['portfolio_size_bin'] = pd.cut(df['portfolio_size_raw'],
198
+ bins=[0, 5, 10, 15, 20, 100],
199
+ labels=[0, 1, 2, 3, 4]).astype(int)
200
+
201
+ # 2h. ED2 split (from V2)
202
def get_detailed_round(row):
    """Return the detailed application-round label for one application row.

    Resolves the raw round text recorded for this (student, school) pair in
    the module-level ``round_lookup``; when no entry matches, falls back to
    the row's coarse ``round_cat`` column, normalizing legacy 'ED' to 'ED1'.
    """
    student = str(row.get('student_id', '')).replace('.0', '')
    raw = round_lookup.get(f"{student}_{str(row.get('school', ''))}", '')
    # Order matters: more specific phrases must be tested before their own
    # substrings ("Early Decision II" before "Early Decision", "Restrictive
    # Early Action" before "Early Action").
    for phrase, label in (('Early Decision II', 'ED2'),
                          ('Early Decision', 'ED1'),
                          ('Restrictive Early Action', 'REA'),
                          ('Early Action II', 'EA'),
                          ('Early Action', 'EA'),
                          ('Regular Decision', 'RD')):
        if phrase in raw:
            return label
    # No raw-text match: use the coarse category instead.
    fallback = str(row.get('round_cat', 'RD'))
    return 'ED1' if fallback == 'ED' else fallback
222
+
223
+ df['round_cat_v2'] = df.apply(get_detailed_round, axis=1)
224
+ print(f"\n Round distribution (v2):")
225
+ print(df['round_cat_v2'].value_counts().to_string())
226
+
227
+ df['is_ed1'] = (df['round_cat_v2'] == 'ED1').astype(int)
228
+ df['is_ed2'] = (df['round_cat_v2'] == 'ED2').astype(int)
229
+ df['is_rea'] = (df['round_cat_v2'] == 'REA').astype(int)
230
+ df['is_early'] = df['round_cat_v2'].isin(['ED1', 'ED2', 'EA', 'REA']).astype(int)
231
+ df['round_cat'] = df['round_cat_v2']
232
+
233
+ # ============================================================
234
+ # 3. PARSE LLM FEATURES
235
+ # ============================================================
236
+ act_scores = {}
237
+ raw = llm_features_loaded.get('act_scores', {})
238
+ if isinstance(raw, list):
239
+ for item in raw:
240
+ if isinstance(item, dict) and item.get('success', False):
241
+ sid_raw = str(item.get('student_id', ''))
242
+ act_scores[sid_raw] = item
243
+ parts = sid_raw.split('_')
244
+ for p in parts:
245
+ clean = p.replace('.0', '')
246
+ if clean.isdigit():
247
+ act_scores[clean] = item
248
+ elif isinstance(raw, dict):
249
+ for sid, scores in raw.items():
250
+ if isinstance(scores, dict):
251
+ act_scores[sid] = scores
252
+
253
+ supp_scores = {}
254
+ raw = llm_features_loaded.get('supp_scores', {})
255
+ if isinstance(raw, list):
256
+ for item in raw:
257
+ if isinstance(item, dict) and item.get('success', False):
258
+ sid = str(item.get('student_id', '')).replace('.0', '')
259
+ school = str(item.get('school', ''))
260
+ key = f"{sid}_{school}"
261
+ # FIX: Filter out score=1 entries (no supplement, not real scores)
262
+ oq = item.get('overall_quality', 0)
263
+ if isinstance(oq, (int, float)) and oq <= 1:
264
+ continue # Skip fake scores
265
+ supp_scores[key] = item
266
+ elif isinstance(raw, dict):
267
+ for key, scores in raw.items():
268
+ if isinstance(scores, dict):
269
+ oq = scores.get('overall_quality', 0)
270
+ if isinstance(oq, (int, float)) and oq <= 1:
271
+ continue # Skip fake scores
272
+ supp_scores[key] = scores
273
+ print(f" Supp scores after filtering score=1: {len(supp_scores)} valid entries")
274
+
275
+ major_diff = llm_features_loaded.get('major_diff', {})
276
+ if isinstance(major_diff, list):
277
+ major_diff = {}
278
+
279
+ ps_yale = {}
280
+ raw = llm_features_loaded.get('ps_yale', {})
281
+ if isinstance(raw, list):
282
+ for item in raw:
283
+ if isinstance(item, dict):
284
+ sid = str(item.get('student_id', '')).replace('.0', '')
285
+ ps_yale[sid] = item
286
+ elif isinstance(raw, dict):
287
+ ps_yale = raw
288
+
289
+ print(f"\nLLM features: Activity={len(act_scores)}, Supp={len(supp_scores)}, MajorDiff={len(major_diff)}, PS={len(ps_yale)}")
290
+
291
+ ACT_DIMS = ['max_power_index', 'avg_power_index', 'n_high_power',
292
+ 'n_founder', 'n_president', 'max_scope',
293
+ 'has_publication', 'has_patent', 'has_summer_program',
294
+ 'summer_program_tier', 'has_olympiad', 'olympiad_level',
295
+ 'activity_coherence', 'spike_strength']
296
+
297
+ SUPP_DIMS = ['overall_quality', 'specificity_score', 'enthusiasm_score',
298
+ 'has_imagination_scene', 'mentions_specific_course',
299
+ 'mentions_specific_professor', 'mentions_specific_program',
300
+ 'mentions_specific_facility', 'coherence_with_major', 'has_red_flag']
301
+
302
+ sample_ps = next(iter(ps_yale.values()), {}) if ps_yale else {}
303
+ PS_DIMS = [k for k in sample_ps.keys() if k not in ['student_id', 'success', 'error', 'note', 'essay_type']
304
+ and not k.startswith('is_')]
305
+ if not PS_DIMS:
306
+ PS_DIMS = ['show_not_tell', 'reflection_depth', 'authentic_voice',
307
+ 'coherence_focus', 'overall_effectiveness']
308
+
309
+ # ============================================================
310
+ # 4. DEFINE FEATURE GROUPS (adapted for v6 = v5 + restored v4 cols)
311
+ # ============================================================
312
+ STUDENT_LEVEL_NUMERIC = [
313
+ 'toefl', 'sat', 'gpa',
314
+ 'act_total_count', 'act_type_diversity',
315
+ *[f'act_slot_pca_{i}' for i in range(20)],
316
+ *[f'act_bert_pca_{i}' for i in range(16)],
317
+ # Restored from v4: ps_bert_pca
318
+ *[f'ps_bert_pca_{i}' for i in range(16)],
319
+ 'honors_max_score', 'honors_avg_score', 'honors_min_score',
320
+ 'honors_count', 'honors_total_score',
321
+ 'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
322
+ 'honors_has_national',
323
+ # NEW: honors quality ratio
324
+ 'honors_quality_ratio',
325
+ 'cuilu_hs_top10_rate', 'cuilu_hs_top20_rate',
326
+ 'cuilu_hs_top10_count', 'cuilu_hs_top20_count',
327
+ 'cuilu_hs_total',
328
+ 'cuilu_feeder_rank', 'cuilu_hs_type_rate', 'cuilu_region_rate',
329
+ # Restored from v4: hs_to_univ_hist
330
+ 'hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_to_univ_hist_admits',
331
+ 'hs_overall_hist_rate',
332
+ # Restored from v4: summer features
333
+ 'summer_max_geili', 'summer_has_elite', 'summer_count',
334
+ 'summer_program_count', 'summer_difficulty_max',
335
+ ]
336
+
337
+ # Add act_type_count columns dynamically
338
+ act_type_cols_in_data = [c for c in df.columns if c.startswith('act_type_count_')]
339
+ STUDENT_LEVEL_NUMERIC.extend(act_type_cols_in_data)
340
+
341
+ # Filter to only existing columns
342
+ STUDENT_LEVEL_NUMERIC = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
343
+ print(f"\n Student-level numeric features: {len(STUDENT_LEVEL_NUMERIC)}")
344
+
345
+ KEY_STUDENT_FEATURES = [
346
+ 'toefl', 'sat', 'gpa',
347
+ 'honors_max_score', 'honors_avg_score', 'honors_count',
348
+ 'honors_quality_ratio',
349
+ 'act_type_diversity', 'act_total_count',
350
+ # Restored high-value features
351
+ 'hs_to_univ_hist_rate_smoothed',
352
+ 'summer_max_geili',
353
+ # Admitted similarity features (time-respecting)
354
+ 'act_bert_admit_sim', 'ps_bert_admit_sim', 'act_slot_admit_sim',
355
+ 'ps_bert_sim_contrast',
356
+ # Grok PS scores
357
+ 'grok_ps_overall', 'grok_ps_originality',
358
+ ]
359
+
360
+ LLM_INTERACTION_FEATURES = [
361
+ 'llm_act_mean', 'llm_act_max', 'llm_act_avg_power_index',
362
+ 'supp_mean', 'supp_max', 'ps_mean',
363
+ 'major_difficulty',
364
+ # V5 additions
365
+ 'grok_ps_overall', 'grok_ps_originality', 'grok_ps_mean',
366
+ 'act_bert_admit_sim', 'ps_bert_admit_sim', 'ps_bert_sim_contrast',
367
+ ]
368
+
369
+ # ============================================================
370
+ # 5. BUILD FEATURES
371
+ # ============================================================
372
+ def build_features_base(df):
373
+ """Build base features WITHOUT residualization."""
374
+ df = df.copy()
375
+
376
+ df['is_partial_year'] = (df['year'] == 2025).astype(int)
377
+ df['year_cat'] = df['year'].astype(str)
378
+ df['sid_str'] = df['student_id'].astype(str).str.replace('.0', '', regex=False)
379
+
380
+ # LLM Activity features
381
+ for dim in ACT_DIMS:
382
+ col_name = f'llm_act_{dim}'
383
+ df[col_name] = df['sid_str'].map(
384
+ lambda s, d=dim: safe_num(act_scores.get(s, {}).get(d, np.nan)))
385
+
386
+ # LLM Supp features
387
+ def get_supp_score(row, dim):
388
+ key = f"{row['sid_str']}_{row['school']}"
389
+ return safe_num(supp_scores.get(key, {}).get(dim, np.nan))
390
+ for dim in SUPP_DIMS:
391
+ col_name = f'supp_{dim}'
392
+ df[col_name] = df.apply(lambda r, d=dim: get_supp_score(r, d), axis=1)
393
+
394
+ # Major difficulty
395
+ def get_major_diff(row):
396
+ key = f"{row['school']}_{row['major_cat']}"
397
+ return safe_num(major_diff.get(key, {}).get('difficulty_score', np.nan))
398
+ df['major_difficulty'] = df.apply(get_major_diff, axis=1)
399
+
400
+ # PS Yale scores
401
+ for dim in PS_DIMS:
402
+ col_name = f'ps_{dim}'
403
+ df[col_name] = df['sid_str'].map(
404
+ lambda s, d=dim: safe_num(ps_yale.get(s, {}).get(d, np.nan)))
405
+
406
+ # Grok 4.2 PS scores (NEW in V5)
407
+ grok_ps = llm_features_loaded.get('grok_ps_scores', {})
408
+ grok_ps_dims = ['overall_score', 'originality_depth', 'reflection_depth', 'authenticity_structure']
409
+ for dim in grok_ps_dims:
410
+ col_name = f'grok_ps_{dim.replace("_score", "").replace("_depth", "")}'
411
+ df[col_name] = df['sid_str'].map(
412
+ lambda s, d=dim: safe_num(grok_ps.get(s, {}).get(d, np.nan)) if grok_ps.get(s, {}).get('success') else np.nan)
413
+
414
+ # Grok PS red flags as binary features (from v7 CSV, already present)
415
+ # ps_flag_* columns are already in v7 matrix
416
+
417
+ # Grok PS aggregates
418
+ grok_cols = [c for c in df.columns if c.startswith('grok_ps_')]
419
+ if grok_cols:
420
+ df['grok_ps_mean'] = df[grok_cols].mean(axis=1)
421
+ print(f" Grok PS scores: {len(grok_cols)} dims, {df['grok_ps_mean'].notna().sum()} valid rows")
422
+
423
+ # Aggregates (use NaN-aware operations)
424
+ llm_act_cols = [f'llm_act_{d}' for d in ACT_DIMS]
425
+ valid_act = df[llm_act_cols]
426
+ df['llm_act_mean'] = valid_act.mean(axis=1)
427
+ df['llm_act_max'] = valid_act.max(axis=1)
428
+ df['llm_act_n_valid'] = valid_act.notna().sum(axis=1)
429
+
430
+ supp_num_cols = [f'supp_{d}' for d in SUPP_DIMS if d not in ['has_red_flag']]
431
+ valid_supp = df[supp_num_cols]
432
+ df['supp_mean'] = valid_supp.mean(axis=1)
433
+ df['supp_max'] = valid_supp.max(axis=1)
434
+
435
+ ps_cols = [f'ps_{d}' for d in PS_DIMS]
436
+ valid_ps = df[ps_cols]
437
+ df['ps_mean'] = valid_ps.mean(axis=1)
438
+
439
+ # Basic interactions (NaN-safe: NaN * anything = NaN, which is fine)
440
+ df['toefl_x_sat'] = df['toefl'] * df['sat'] / 10000.0
441
+ df['gpa_x_toefl'] = df['gpa'] * df['toefl'] / 100.0
442
+ df['llm_act_x_supp'] = df['llm_act_mean'] * df['supp_mean']
443
+
444
+ if 'honors_avg_score' in df.columns:
445
+ df['honors_x_sat'] = df['honors_avg_score'] * df['sat'] / 1600
446
+ df['honors_x_toefl'] = df['honors_avg_score'] * df['toefl'] / 120
447
+
448
+ if 'cuilu_hs_top10_rate' in df.columns and 'taste_score_sensitivity' in df.columns:
449
+ df['cuilu_x_taste'] = df['cuilu_hs_top10_rate'] * df['taste_score_sensitivity']
450
+
451
+ # Categoricals - with round_cat v2 (EA/ED1/ED2/REA/RD)
452
+ cat_cols = ['school', 'round_cat', 'major_cat', 'hs_cat', 'year_cat', 'hs_name', 'province']
453
+ cat_cols = [c for c in cat_cols if c in df.columns]
454
+
455
+ if 'round_cat' in df.columns:
456
+ df['school_round'] = df['school'].astype(str) + '_' + df['round_cat'].astype(str)
457
+ cat_cols.append('school_round')
458
+ df['school_major'] = df['school'].astype(str) + '_' + df['major_cat'].astype(str)
459
+ cat_cols.append('school_major')
460
+ if 'hs_cat' in df.columns:
461
+ df['school_hstype'] = df['school'].astype(str) + '_' + df['hs_cat'].astype(str)
462
+ cat_cols.append('school_hstype')
463
+
464
+ for c in cat_cols:
465
+ df[c] = df[c].fillna('_MISSING_').astype(str)
466
+ le = LabelEncoder()
467
+ df[c] = le.fit_transform(df[c]).astype(int)
468
+
469
+ return df, cat_cols
470
+
471
+
472
+ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
473
+ """Add residualized + interaction + ED boost features using ONLY training data statistics.
474
+ KEY FIX: Residualization uses only non-NaN values for school means."""
475
+ df = df.copy()
476
+
477
+ # Step 1: Bayesian-smoothed school_base_rate
478
+ train_df = df[train_mask]
479
+ global_rate = train_df[TARGET].mean()
480
+
481
+ school_stats = train_df.groupby('school').agg(
482
+ school_raw_rate=(TARGET, 'mean'),
483
+ school_n_apps=(TARGET, 'count'),
484
+ school_n_admits=(TARGET, 'sum'),
485
+ ).reset_index()
486
+
487
+ SMOOTH_STRENGTH = 30
488
+ school_stats['school_base_rate'] = (
489
+ (school_stats['school_raw_rate'] * school_stats['school_n_apps'] + global_rate * SMOOTH_STRENGTH) /
490
+ (school_stats['school_n_apps'] + SMOOTH_STRENGTH)
491
+ )
492
+
493
+ df = df.merge(school_stats[['school', 'school_base_rate', 'school_n_apps', 'school_n_admits']],
494
+ on='school', how='left')
495
+ df['school_base_rate'] = df['school_base_rate'].fillna(global_rate)
496
+ df['school_n_apps'] = df['school_n_apps'].fillna(0)
497
+ df['school_n_admits'] = df['school_n_admits'].fillna(0)
498
+
499
+ # Step 1b: ED boost per school
500
+ ed1_mask = train_df['is_ed1'] == 1
501
+ rd_mask = train_df['is_early'] == 0
502
+
503
+ ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
504
+ rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()
505
+
506
+ ed_boost_map = {}
507
+ for school in ed1_school_rates.index:
508
+ if school in rd_school_rates.index:
509
+ ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
510
+ df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)
511
+
512
+ ed2_mask = train_df['is_ed2'] == 1
513
+ ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
514
+ ed2_boost_map = {}
515
+ for school in ed2_school_rates.index:
516
+ if school in rd_school_rates.index:
517
+ ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
518
+ df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)
519
+
520
+ # Step 2: Residualize student features
521
+ # KEY FIX: Use only non-NaN values for school means
522
+ student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
523
+
524
+ resid_cols = []
525
+ for col in student_feat_available:
526
+ resid_col = f'{col}_resid'
527
+ # Compute school mean using ONLY non-NaN training values
528
+ school_mean_series = train_df.groupby('school')[col].mean() # NaN excluded by default
529
+ col_school_mean = df['school'].map(school_mean_series)
530
+ # Residual: student value - school mean (NaN if either is NaN)
531
+ df[resid_col] = df[col] - col_school_mean
532
+ resid_cols.append(resid_col)
533
+
534
+ # Step 3: Explicit interactions (student feature x school_base_rate)
535
+ interaction_cols = []
536
+ for col in KEY_STUDENT_FEATURES:
537
+ if col in df.columns:
538
+ int_col = f'{col}_x_school_rate'
539
+ df[int_col] = df[col] * df['school_base_rate'] # NaN propagates naturally
540
+ interaction_cols.append(int_col)
541
+
542
+ resid_col = f'{col}_resid'
543
+ if resid_col in df.columns:
544
+ int_resid_col = f'{col}_resid_x_rate'
545
+ df[int_resid_col] = df[resid_col] * df['school_base_rate']
546
+ interaction_cols.append(int_resid_col)
547
+
548
+ # Step 3b: LLM feature x school_base_rate interactions
549
+ for col in LLM_INTERACTION_FEATURES:
550
+ if col in df.columns:
551
+ int_col = f'{col}_x_school_rate'
552
+ df[int_col] = df[col] * df['school_base_rate']
553
+ interaction_cols.append(int_col)
554
+
555
+ # Step 3c: portfolio_size x school_base_rate interaction
556
+ if 'portfolio_size' in df.columns:
557
+ df['portfolio_x_school_rate'] = df['portfolio_size'] * df['school_base_rate']
558
+ interaction_cols.append('portfolio_x_school_rate')
559
+
560
+ # Step 3d: ED flag x school_ed_boost interaction
561
+ if 'is_ed1' in df.columns:
562
+ df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
563
+ interaction_cols.append('ed1_x_ed_boost')
564
+ if 'is_ed2' in df.columns:
565
+ df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
566
+ interaction_cols.append('ed2_x_ed2_boost')
567
+
568
+ # Step 3e: has_sat/has_toefl/has_gpa interactions with school_base_rate
569
+ for flag in ['has_sat', 'has_toefl', 'has_gpa']:
570
+ if flag in df.columns:
571
+ int_col = f'{flag}_x_school_rate'
572
+ df[int_col] = df[flag] * df['school_base_rate']
573
+ interaction_cols.append(int_col)
574
+
575
+ # Step 4: Student percentile within school (NaN-safe)
576
+ pctile_cols = []
577
+ for col in ['toefl', 'sat', 'gpa', 'honors_max_score',
578
+ 'llm_act_mean', 'supp_mean']:
579
+ if col not in df.columns:
580
+ continue
581
+ pctile_col = f'{col}_school_pctile'
582
+ school_distributions = {}
583
+ for school_id in train_df['school'].unique():
584
+ vals = train_df[train_df['school'] == school_id][col].dropna().values
585
+ if len(vals) > 2:
586
+ school_distributions[school_id] = vals
587
+
588
+ def compute_pctile(row, col=col, sd=school_distributions):
589
+ school_id = row['school']
590
+ val = row[col]
591
+ if pd.isna(val) or school_id not in sd:
592
+ return np.nan # Return NaN instead of 0.5
593
+ dist = sd[school_id]
594
+ return np.mean(dist <= val)
595
+
596
+ df[pctile_col] = df.apply(compute_pctile, axis=1)
597
+ pctile_cols.append(pctile_col)
598
+
599
+ # Step 5: Student competitiveness score (NaN-safe)
600
+ if all(c in df.columns for c in ['toefl', 'sat', 'honors_max_score']):
601
+ # Use NaN-safe computation
602
+ components = []
603
+ weights = []
604
+ for col, w, scale in [('toefl', 0.3, 120), ('sat', 0.3, 1600),
605
+ ('honors_max_score', 0.2, 10), ('llm_act_mean', 0.2, 10)]:
606
+ if col in df.columns:
607
+ components.append(df[col] / scale)
608
+ weights.append(w)
609
+ if components:
610
+ strength_df = pd.DataFrame(components).T
611
+ df['student_strength'] = strength_df.mean(axis=1) # NaN-safe mean
612
+ df['strength_vs_school'] = df['student_strength'] - (1 - df['school_base_rate'])
613
+
614
+ # Build final feature list
615
+ num_cols = [c for c in df.columns if df[c].dtype in ['float64', 'int64', 'float32', 'int32']
616
+ and c not in [TARGET, 'student_id', 'year', 'Unnamed: 0']]
617
+
618
+ all_feat = list(set(num_cols + cat_cols))
619
+ feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
620
+ for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size_raw']:
621
+ if remove in feature_cols:
622
+ feature_cols.remove(remove)
623
+
624
+ # Remove constant columns
625
+ to_drop = [c for c in feature_cols if df[c].nunique() <= 1]
626
+ feature_cols = [c for c in feature_cols if c not in to_drop]
627
+
628
+ # Apply feature selection if provided
629
+ if selected_features is not None:
630
+ must_keep = set(cat_cols) | {'school_base_rate', 'school_n_apps', 'school_n_admits',
631
+ 'student_strength', 'strength_vs_school',
632
+ 'school_ed_boost', 'school_ed2_boost',
633
+ 'is_ed1', 'is_ed2', 'is_rea', 'is_early',
634
+ 'ed1_x_ed_boost', 'ed2_x_ed2_boost',
635
+ 'has_sat', 'has_toefl', 'has_gpa',
636
+ 'portfolio_size', 'portfolio_size_bin', 'portfolio_x_school_rate'}
637
+ feature_cols = [c for c in feature_cols if c in selected_features or c in must_keep]
638
+
639
+ # KEY CHANGE: Do NOT fill NaN for CatBoost - it handles NaN natively
640
+ # Only fill NaN for LGB/XGB later, and only for non-cat columns
641
+ # For now, just handle inf
642
+ for c in feature_cols:
643
+ if df[c].dtype in ['float64', 'float32']:
644
+ df[c] = df[c].replace([np.inf, -np.inf], np.nan)
645
+
646
+ cat_indices = [feature_cols.index(c) for c in cat_cols if c in feature_cols]
647
+
648
+ new_feat_count = len(resid_cols) + len(interaction_cols) + len(pctile_cols) + 5
649
+ print(f" Resid features: {len(resid_cols)} resid + {len(interaction_cols)} interact + {len(pctile_cols)} pctile = {new_feat_count} new, total={len(feature_cols)}")
650
+
651
+ return df, feature_cols, cat_cols, cat_indices
652
+
653
+
654
+ # ============================================================
655
+ # 6. BUILD BASE FEATURES
656
+ # ============================================================
657
+ df_base, cat_cols = build_features_base(df)
658
+ print(f"\nBase features built. Shape: {df_base.shape}")
659
+
660
+ # Quick NaN summary
661
+ print(f"\n NaN summary after fixes:")
662
+ for col in ['sat', 'toefl', 'gpa']:
663
+ nan_pct = df_base[col].isna().mean() * 100
664
+ print(f" {col}: {nan_pct:.1f}% NaN")
665
+
666
+ y = df_base[TARGET].values
667
+ groups = df_base['student_id'].values
668
+
669
+ # ============================================================
670
+ # 7. STAGE 1: FEATURE IMPORTANCE ESTIMATION
671
+ # ============================================================
672
+ print(f"\n{'='*70}")
673
+ print(f" STAGE 1: FEATURE IMPORTANCE ESTIMATION")
674
+ print(f"{'='*70}")
675
+
676
+ stage1_fi = []
677
+ gkf_s1 = GroupKFold(n_splits=5)
678
+ for fold, (tr_idx, va_idx) in enumerate(gkf_s1.split(df_base, y, groups)):
679
+ train_mask = pd.Series(False, index=df_base.index)
680
+ train_mask.iloc[tr_idx] = True
681
+
682
+ df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
683
+ df_base, train_mask, cat_cols)
684
+
685
+ X_tr = df_fold[feat_cols_f].iloc[tr_idx]
686
+ X_va = df_fold[feat_cols_f].iloc[va_idx]
687
+ y_tr = y[tr_idx]
688
+ y_va = y[va_idx]
689
+
690
+ for c in cat_cols_f:
691
+ if c in X_tr.columns:
692
+ X_tr[c] = X_tr[c].astype(int)
693
+ X_va[c] = X_va[c].astype(int)
694
+ # CatBoost handles NaN natively - don't fill
695
+
696
+ cb = CatBoostClassifier(
697
+ iterations=500, depth=6, learning_rate=0.05,
698
+ l2_leaf_reg=7, random_seed=42, verbose=0,
699
+ cat_features=cat_idx_f, eval_metric='AUC',
700
+ early_stopping_rounds=50)
701
+ pool_tr = Pool(X_tr, y_tr, cat_features=cat_idx_f)
702
+ pool_va = Pool(X_va, y_va, cat_features=cat_idx_f)
703
+ cb.fit(pool_tr, eval_set=pool_va, verbose=0)
704
+
705
+ fi = cb.get_feature_importance()
706
+ stage1_fi.append(fi)
707
+
708
+ auc = roc_auc_score(y_va, cb.predict_proba(Pool(X_va, cat_features=cat_idx_f))[:, 1])
709
+ print(f" Fold {fold+1}/5: AUC={auc:.4f}, Features={len(feat_cols_f)}")
710
+
711
+ if fold == 0:
712
+ all_feature_names = feat_cols_f
713
+
714
+ del cb, pool_tr, pool_va, df_fold; gc.collect()
715
+
716
+ # Select top features
717
+ avg_fi = np.mean(stage1_fi, axis=0)
718
+ fi_pairs = sorted(zip(all_feature_names, avg_fi), key=lambda x: -x[1])
719
+
720
+ selected_set = set(cat_cols)
721
+ n_added = 0
722
+ for fname, imp in fi_pairs:
723
+ if fname not in cat_cols:
724
+ selected_set.add(fname)
725
+ n_added += 1
726
+ if n_added >= FEATURE_SELECT_TOP_N:
727
+ break
728
+
729
+ print(f"\n Feature selection: {len(all_feature_names)} -> {len(selected_set)} features")
730
+ print(f" Top 20 features:")
731
+ for i, (fname, imp) in enumerate(fi_pairs[:20]):
732
+ marker = ""
733
+ if '_resid' in fname: marker = " [R]"
734
+ elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [I]"
735
+ elif '_school_pctile' in fname: marker = " [P]"
736
+ elif 'school_base_rate' in fname: marker = " [S]"
737
+ elif 'ed_boost' in fname: marker = " [ED]"
738
+ print(f" {i+1:3d}. {fname:<50s} {imp:>8.2f}{marker}")
739
+
740
+ # ============================================================
741
+ # 8. TEMPORAL VALIDATION WITH SELECTED FEATURES
742
+ # ============================================================
743
+ print(f"\n{'='*70}")
744
+ print(f" TEMPORAL VALIDATION (2020-2023 -> 2024) WITH FEATURE SELECTION")
745
+ print(f"{'='*70}")
746
+
747
+ mask_train_temporal = df_base['year'].isin([2020, 2021, 2022, 2023])
748
+ mask_test_temporal = df_base['year'] == 2024
749
+
750
+ temporal_results = {}
751
+ if mask_test_temporal.sum() > 0:
752
+ df_temporal, feat_cols_t, cat_cols_t, cat_idx_t = add_residualized_features(
753
+ df_base, mask_train_temporal, cat_cols, selected_features=selected_set)
754
+
755
+ X_t = df_temporal[feat_cols_t].copy()
756
+ for c in cat_cols_t:
757
+ if c in X_t.columns:
758
+ X_t[c] = X_t[c].astype(int)
759
+
760
+ X_tr_t = X_t[mask_train_temporal]
761
+ X_te_t = X_t[mask_test_temporal]
762
+ y_tr_t = y[mask_train_temporal]
763
+ y_te_t = y[mask_test_temporal]
764
+
765
+ # For LGB/XGB: fill NaN with -999 (a value they can split on)
766
+ X_tr_t_filled = X_tr_t.fillna(-999)
767
+ X_te_t_filled = X_te_t.fillna(-999)
768
+
769
+ print(f" Train: {len(X_tr_t)}, Test: {len(X_te_t)}, Features: {len(feat_cols_t)}")
770
+
771
+ for seed in SEEDS:
772
+ # CatBoost: native NaN handling
773
+ cb_t = CatBoostClassifier(
774
+ iterations=1000, depth=6, learning_rate=0.03,
775
+ l2_leaf_reg=7, random_seed=seed, verbose=0,
776
+ cat_features=cat_idx_t, eval_metric='AUC',
777
+ early_stopping_rounds=100, min_data_in_leaf=10)
778
+ pool_tr = Pool(X_tr_t, y_tr_t, cat_features=cat_idx_t)
779
+ pool_te = Pool(X_te_t, y_te_t, cat_features=cat_idx_t)
780
+ cb_t.fit(pool_tr, eval_set=pool_te, verbose=0)
781
+ cb_pred = cb_t.predict_proba(Pool(X_te_t, cat_features=cat_idx_t))[:, 1]
782
+ del cb_t; gc.collect()
783
+
784
+ # LGB: use filled data
785
+ lgb_tr = lgb.Dataset(X_tr_t_filled.values, y_tr_t, categorical_feature=cat_idx_t)
786
+ lgb_va = lgb.Dataset(X_te_t_filled.values, y_te_t, categorical_feature=cat_idx_t, reference=lgb_tr)
787
+ lgb_params = {
788
+ 'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
789
+ 'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
790
+ 'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
791
+ 'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
792
+ 'seed': seed
793
+ }
794
+ lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
795
+ valid_sets=[lgb_va],
796
+ callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
797
+ lgb_pred = lgb_model.predict(X_te_t_filled.values)
798
+ del lgb_model; gc.collect()
799
+
800
+ # XGB: use filled data
801
+ dtrain = xgb.DMatrix(X_tr_t_filled.values, label=y_tr_t, enable_categorical=False)
802
+ dtest = xgb.DMatrix(X_te_t_filled.values, label=y_te_t, enable_categorical=False)
803
+ xgb_params = {
804
+ 'objective': 'binary:logistic', 'eval_metric': 'auc',
805
+ 'max_depth': 6, 'learning_rate': 0.03,
806
+ 'subsample': 0.8, 'colsample_bytree': 0.7,
807
+ 'reg_alpha': 0.3, 'reg_lambda': 2.0,
808
+ 'min_child_weight': 5,
809
+ 'seed': seed, 'verbosity': 0
810
+ }
811
+ xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
812
+ evals=[(dtest, 'val')],
813
+ early_stopping_rounds=100, verbose_eval=False)
814
+ xgb_pred = xgb_model.predict(dtest)
815
+ del xgb_model, dtrain, dtest; gc.collect()
816
+
817
+ blend = 0.45 * cb_pred + 0.20 * lgb_pred + 0.35 * xgb_pred
818
+ temporal_results[seed] = {
819
+ 'cb': float(roc_auc_score(y_te_t, cb_pred)),
820
+ 'lgb': float(roc_auc_score(y_te_t, lgb_pred)),
821
+ 'xgb': float(roc_auc_score(y_te_t, xgb_pred)),
822
+ 'blend': float(roc_auc_score(y_te_t, blend))
823
+ }
824
+ print(f" Seed {seed}: CB={temporal_results[seed]['cb']:.4f} LGB={temporal_results[seed]['lgb']:.4f} XGB={temporal_results[seed]['xgb']:.4f} Blend={temporal_results[seed]['blend']:.4f}")
825
+
826
+ avg_temporal = np.mean([v['blend'] for v in temporal_results.values()])
827
+ print(f"\n AVG Temporal Blend: {avg_temporal:.4f} (V37.3: 0.8410, V38.2-PRO-V4: 0.8555)")
828
+ print(f" Delta vs V37.3: {avg_temporal - 0.8410:+.4f}")
829
+ print(f" Delta vs V38.2-PRO-V4: {avg_temporal - 0.8555:+.4f}")
830
+
831
+ del df_temporal, X_t; gc.collect()
832
+ else:
833
+ avg_temporal = 0.0
834
+
835
+ # ============================================================
836
+ # 9. STAGE 2: MULTI-SEED GROUPKFOLD
837
+ # ============================================================
838
+ print(f"\n{'='*70}")
839
+ print(f" STAGE 2: MULTI-SEED GROUPKFOLD ({len(SEEDS)} seeds x {N_FOLDS} folds)")
840
+ print(f"{'='*70}")
841
+
842
+ all_cb_oof = []
843
+ all_lgb_oof = []
844
+ all_xgb_oof = []
845
+ all_fi = []
846
+ feature_cols_final = None
847
+
848
+ for seed_idx, seed in enumerate(SEEDS):
849
+ print(f"\n --- Seed {seed} ({seed_idx+1}/{len(SEEDS)}) ---")
850
+ gkf = GroupKFold(n_splits=N_FOLDS)
851
+ cb_oof = np.zeros(len(df_base))
852
+ lgb_oof = np.zeros(len(df_base))
853
+ xgb_oof = np.zeros(len(df_base))
854
+
855
+ for fold, (tr_idx, va_idx) in enumerate(gkf.split(df_base, y, groups)):
856
+ train_mask = pd.Series(False, index=df_base.index)
857
+ train_mask.iloc[tr_idx] = True
858
+
859
+ df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
860
+ df_base, train_mask, cat_cols, selected_features=selected_set)
861
+
862
+ if feature_cols_final is None:
863
+ feature_cols_final = feat_cols_f
864
+ print(f" Total features after selection: {len(feat_cols_f)}")
865
+
866
+ X_fold = df_fold[feat_cols_f].copy()
867
+ for c in cat_cols_f:
868
+ if c in X_fold.columns:
869
+ X_fold[c] = X_fold[c].astype(int)
870
+
871
+ X_tr_df = X_fold.iloc[tr_idx]
872
+ X_va_df = X_fold.iloc[va_idx]
873
+ y_tr = y[tr_idx]
874
+ y_va = y[va_idx]
875
+
876
+ # CatBoost: native NaN
877
+ cb = CatBoostClassifier(
878
+ iterations=1500, depth=6, learning_rate=0.03,
879
+ l2_leaf_reg=7, random_seed=seed, verbose=0,
880
+ cat_features=cat_idx_f, eval_metric='AUC',
881
+ early_stopping_rounds=100, min_data_in_leaf=10)
882
+ pool_tr = Pool(X_tr_df, y_tr, cat_features=cat_idx_f)
883
+ pool_va = Pool(X_va_df, y_va, cat_features=cat_idx_f)
884
+ cb.fit(pool_tr, eval_set=pool_va, verbose=0)
885
+ cb_pred = cb.predict_proba(Pool(X_va_df, cat_features=cat_idx_f))[:, 1]
886
+ cb_oof[va_idx] = cb_pred
887
+
888
+ if fold == N_FOLDS - 1:
889
+ all_fi.append(cb.get_feature_importance())
890
+ del cb, pool_tr, pool_va; gc.collect()
891
+
892
+ # LGB/XGB: fill NaN
893
+ X_tr_filled = X_tr_df.fillna(-999).values
894
+ X_va_filled = X_va_df.fillna(-999).values
895
+
896
+ lgb_tr = lgb.Dataset(X_tr_filled, y_tr, categorical_feature=cat_idx_f)
897
+ lgb_va_ds = lgb.Dataset(X_va_filled, y_va, categorical_feature=cat_idx_f, reference=lgb_tr)
898
+ lgb_params = {
899
+ 'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
900
+ 'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
901
+ 'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
902
+ 'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
903
+ 'seed': seed
904
+ }
905
+ lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
906
+ valid_sets=[lgb_va_ds],
907
+ callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
908
+ lgb_pred = lgb_model.predict(X_va_filled)
909
+ lgb_oof[va_idx] = lgb_pred
910
+ del lgb_model; gc.collect()
911
+
912
+ dtrain = xgb.DMatrix(X_tr_filled, label=y_tr)
913
+ dval = xgb.DMatrix(X_va_filled, label=y_va)
914
+ xgb_params = {
915
+ 'objective': 'binary:logistic', 'eval_metric': 'auc',
916
+ 'max_depth': 6, 'learning_rate': 0.03,
917
+ 'subsample': 0.8, 'colsample_bytree': 0.7,
918
+ 'reg_alpha': 0.3, 'reg_lambda': 2.0,
919
+ 'min_child_weight': 5,
920
+ 'seed': seed, 'verbosity': 0
921
+ }
922
+ xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
923
+ evals=[(dval, 'val')],
924
+ early_stopping_rounds=100, verbose_eval=False)
925
+ xgb_pred = xgb_model.predict(dval)
926
+ xgb_oof[va_idx] = xgb_pred
927
+ del xgb_model, dtrain, dval, df_fold, X_fold; gc.collect()
928
+
929
+ if (fold + 1) % 5 == 0:
930
+ print(f" Fold {fold+1}/{N_FOLDS} done")
931
+
932
+ cb_auc = roc_auc_score(y, cb_oof)
933
+ lgb_auc = roc_auc_score(y, lgb_oof)
934
+ xgb_auc = roc_auc_score(y, xgb_oof)
935
+ print(f" CB: {cb_auc:.4f} LGB: {lgb_auc:.4f} XGB: {xgb_auc:.4f}")
936
+
937
+ all_cb_oof.append(cb_oof)
938
+ all_lgb_oof.append(lgb_oof)
939
+ all_xgb_oof.append(xgb_oof)
940
+
941
+ # ============================================================
942
+ # 10. ENSEMBLE & BLEND
943
+ # ============================================================
944
+ print(f"\n{'='*70}")
945
+ print(f" ENSEMBLE RESULTS")
946
+ print(f"{'='*70}")
947
+
948
+ cb_avg = np.mean(all_cb_oof, axis=0)
949
+ lgb_avg = np.mean(all_lgb_oof, axis=0)
950
+ xgb_avg = np.mean(all_xgb_oof, axis=0)
951
+
952
+ cb_final_auc = roc_auc_score(y, cb_avg)
953
+ lgb_final_auc = roc_auc_score(y, lgb_avg)
954
+ xgb_final_auc = roc_auc_score(y, xgb_avg)
955
+
956
+ print(f" CB {len(SEEDS)}-seed avg: {cb_final_auc:.4f}")
957
+ print(f" LGB {len(SEEDS)}-seed avg: {lgb_final_auc:.4f}")
958
+ print(f" XGB {len(SEEDS)}-seed avg: {xgb_final_auc:.4f}")
959
+
960
+ best_auc = 0
961
+ best_weights = (0.45, 0.20, 0.35)
962
+ for w_cb in np.arange(0.2, 0.7, 0.05):
963
+ for w_lgb in np.arange(0.05, 0.5, 0.05):
964
+ w_xgb = 1.0 - w_cb - w_lgb
965
+ if w_xgb < 0.05: continue
966
+ blend = w_cb * cb_avg + w_lgb * lgb_avg + w_xgb * xgb_avg
967
+ auc = roc_auc_score(y, blend)
968
+ if auc > best_auc:
969
+ best_auc = auc
970
+ best_weights = (w_cb, w_lgb, w_xgb)
971
+
972
+ print(f"\n Best 3-model blend: {best_auc:.4f}")
973
+ print(f" Delta vs V37.3: {best_auc - 0.8697:+.4f}")
974
+ print(f" Delta vs V38.2-PRO-V4: {best_auc - 0.8758:+.4f}")
975
+ print(f" Weights: CB={best_weights[0]:.2f} LGB={best_weights[1]:.2f} XGB={best_weights[2]:.2f}")
976
+
977
+ rank_blend = (rankdata(cb_avg) + rankdata(lgb_avg) + rankdata(xgb_avg)) / 3
978
+ rank_auc = roc_auc_score(y, rank_blend)
979
+ print(f" Rank blend: {rank_auc:.4f}")
980
+
981
+ final_blend_prob = best_weights[0] * cb_avg + best_weights[1] * lgb_avg + best_weights[2] * xgb_avg
982
+ final_auc = roc_auc_score(y, final_blend_prob)
983
+ final_brier = brier_score_loss(y, np.clip(final_blend_prob, 1e-7, 1-1e-7))
984
+ final_logloss = log_loss(y, np.clip(final_blend_prob, 1e-7, 1-1e-7))
985
+
986
+ print(f"\n FINAL METRICS:")
987
+ print(f" AUC: {final_auc:.4f} (V37.3: 0.8697, V38.2-PRO-V4: 0.8758)")
988
+ print(f" Brier: {final_brier:.4f}")
989
+ print(f" LogLoss: {final_logloss:.4f}")
990
+
991
+ # ============================================================
992
+ # 11. FEATURE IMPORTANCE
993
+ # ============================================================
994
+ print(f"\n{'='*70}")
995
+ print(f" FEATURE IMPORTANCE (avg across seeds)")
996
+ print(f"{'='*70}")
997
+
998
+ if feature_cols_final and all_fi:
999
+ avg_fi = np.mean(all_fi, axis=0)
1000
+ fi_pairs = sorted(zip(feature_cols_final, avg_fi), key=lambda x: -x[1])
1001
+
1002
+ print(f" {'Rank':<5s} {'Feature':<50s} {'Importance':>10s}")
1003
+ print(f" {'-'*5} {'-'*50} {'-'*10}")
1004
+ for i, (fname, imp) in enumerate(fi_pairs[:50]):
1005
+ marker = ""
1006
+ if '_resid' in fname: marker = " [RESID]"
1007
+ elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [INTERACT]"
1008
+ elif '_school_pctile' in fname: marker = " [PCTILE]"
1009
+ elif fname.startswith('school_base_rate'): marker = " [SCHOOL_RATE]"
1010
+ elif 'ed_boost' in fname or 'ed2_boost' in fname: marker = " [ED_BOOST]"
1011
+ elif fname.startswith('has_'): marker = " [FLAG]"
1012
+ print(f" {i+1:<5d} {fname:<50s} {imp:>10.2f}{marker}")
1013
+
1014
+ resid_in_top30 = sum(1 for f, _ in fi_pairs[:30]
1015
+ if '_resid' in f or '_x_school_rate' in f or '_school_pctile' in f or 'school_base_rate' in f)
1016
+ print(f"\n Residualized/interaction features in top 30: {resid_in_top30}")
1017
+
1018
+ # ============================================================
1019
+ # 12. SAVE RESULTS
1020
+ # ============================================================
1021
+ elapsed = time.time() - start_time
1022
+
1023
+ results = {
1024
+ 'version': 'V38.2-pro-v4',
1025
+ 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
1026
+ 'elapsed_minutes': elapsed / 60,
1027
+ 'changes': [
1028
+ 'Use v5 feature matrix (GPA fixed: 40.3% coverage)',
1029
+ 'SAT=0 -> NaN + has_sat flag',
1030
+ 'TOEFL=0 -> NaN + has_toefl flag',
1031
+ 'GPA=0 -> NaN + has_gpa flag',
1032
+ '-1 -> NaN for sentinel columns',
1033
+ 'Residualization uses only non-NaN values',
1034
+ 'CatBoost native NaN handling',
1035
+ 'LGB/XGB use -999 for NaN',
1036
+ 'Percentile returns NaN instead of 0.5 for missing',
1037
+ ],
1038
+ 'comparison': {
1039
+ 'v37_3': {'auc': 0.8697, 'temporal_auc': 0.8410},
1040
+ 'v38_2_pro_v3': {'auc': 0.8743, 'temporal_auc': 0.8528},
1041
+ },
1042
+ 'temporal_validation': {
1043
+ 'per_seed': temporal_results,
1044
+ 'avg_blend': float(avg_temporal),
1045
+ },
1046
+ 'groupkfold': {
1047
+ 'best_3model_blend': float(best_auc),
1048
+ 'best_weights': [float(w) for w in best_weights],
1049
+ 'rank_blend': float(rank_auc),
1050
+ },
1051
+ 'final_metrics': {
1052
+ 'auc': float(final_auc),
1053
+ 'brier': float(final_brier),
1054
+ 'logloss': float(final_logloss),
1055
+ },
1056
+ 'n_features': len(feature_cols_final) if feature_cols_final else 0,
1057
+ 'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
1058
+ }
1059
+
1060
+ with open(os.path.join(OUTPUT_DIR, 'v38_2_pro_v4_results.json'), 'w') as f:
1061
+ json.dump(results, f, indent=2)
1062
+
1063
+ oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
1064
+ oof_df['cb_pred'] = cb_avg
1065
+ oof_df['lgb_pred'] = lgb_avg
1066
+ oof_df['xgb_pred'] = xgb_avg
1067
+ oof_df['final_pred'] = final_blend_prob
1068
+ oof_df.to_csv(os.path.join(OUTPUT_DIR, 'v38_2_pro_v5_oof_predictions.csv'), index=False)
1069
+
1070
+ print(f"\n{'='*70}")
1071
+ print(f" V38.2-PRO-V5 COMPLETE")
1072
+ print(f" Total time: {elapsed/60:.1f} minutes")
1073
+ print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'}")
1074
+ print(f" GroupKFold AUC: {final_auc:.4f} (V37.3: 0.8697, V38.2-PRO-V4: 0.8758)")
1075
+ print(f" Temporal AUC: {avg_temporal:.4f} (V37.3: 0.8410, V38.2-PRO-V4: 0.8555)")
1076
+ print(f"{'='*70}")