catninja123 commited on
Commit
d740537
·
verified ·
1 Parent(s): 8a960b8

Upload train_v38_2_pro_v3.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_v38_2_pro_v3.py +1024 -0
train_v38_2_pro_v3.py ADDED
@@ -0,0 +1,1024 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ====================================================================
3
+ V38.2-PRO-V3 MODEL - Data Quality Fixes + v5 Feature Matrix
4
+ ====================================================================
5
+ Changes from V38.2-PRO-V2:
6
+ 1. Use v5 feature matrix (GPA fixed: 40.3% coverage, normalized to 4.0)
7
+ 2. SAT=0 -> NaN + has_sat flag (59.4% missing)
8
+ 3. TOEFL=0 -> NaN + has_toefl flag (40.4% missing)
9
+ 4. GPA=0 -> NaN (already has has_gpa in v5)
10
+ 5. -1 -> NaN for taste_yearly_admits_log, hs_overall_hist_rate (removed cols handled)
11
+ 6. has_ps=0 -> ps_bert_pca all NaN (fix false BERT signals)
12
+ 7. Residualization uses ONLY valid (non-NaN) values for school means
13
+ 8. CatBoost native NaN handling (no more fillna(-1) for key features)
14
+ ====================================================================
15
+ """
16
+ import pandas as pd
17
+ import numpy as np
18
+ import json, os, warnings, sys, time, pickle, gc
19
+ warnings.filterwarnings('ignore')
20
+ from sklearn.model_selection import GroupKFold
21
+ from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss
22
+ from sklearn.preprocessing import LabelEncoder
23
+ from scipy.stats import rankdata
24
+
25
# Model libraries: import, auto-installing via pip on first failure.
# NOTE(review): the pip fallback assumes network access at runtime — confirm
# this is acceptable in the deployment environment.
try:
    from catboost import CatBoostClassifier, Pool
    import lightgbm as lgb
    import xgboost as xgb
    print("All model libraries loaded successfully")
except ImportError as e:
    print(f"Missing library: {e}")
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install',
                           'catboost', 'lightgbm', 'xgboost', '-q'])
    from catboost import CatBoostClassifier, Pool
    import lightgbm as lgb
    import xgboost as xgb

# All paths are resolved relative to this script's own directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
os.makedirs(OUTPUT_DIR, exist_ok=True)

TARGET = 'target'                  # name of the binary label column
SEEDS = [42, 123, 456, 789, 2024]  # random seeds for seed-averaged models
N_FOLDS = 10                       # CV fold count used later in the pipeline
FEATURE_SELECT_TOP_N = 150         # non-categorical features kept after stage 1
start_time = time.time()           # wall-clock start for runtime reporting
49
+
50
def safe_num(v, default=np.nan):
    """Convert *v* to float, mapping the -1 missing-value sentinel to NaN.

    Accepts ints, floats and numeric strings. Returns *default* (NaN by
    default) for non-numeric strings and any other type (None, dict, ...).
    A value exactly equal to -1 is treated as "missing" and becomes NaN.
    """
    if isinstance(v, (int, float)):
        val = float(v)
        return np.nan if val == -1 else val
    if isinstance(v, str):
        # Narrowed from a bare `except:` — only float() conversion failures
        # should fall back to the default, not SystemExit/KeyboardInterrupt.
        try:
            val = float(v)
        except ValueError:
            return default
        return np.nan if val == -1 else val
    return default
62
+
63
# ============================================================
# 1. LOAD DATA (v5 feature matrix)
# ============================================================
print("=" * 70)
print(" V38.2-PRO-V3: DATA QUALITY FIXES + V5 FEATURE MATRIX")
print("=" * 70)

# Try v5 first, fall back to v4 (v5 has the GPA coverage/normalization fixes
# described in the module docstring).
v5_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v5.csv')
v4_path = os.path.join(DATA_DIR, 'v38_2_integrated_features.csv')
if os.path.exists(v5_path):
    df_raw = pd.read_csv(v5_path)
    print(f"V5 features loaded: {df_raw.shape}")
else:
    df_raw = pd.read_csv(v4_path)
    print(f"V4 features loaded (v5 not found): {df_raw.shape}")

# Load LLM-derived feature files; a missing file yields an empty dict so the
# rest of the pipeline degrades gracefully.
llm_features_loaded = {}
for fname, varname in [
    ('llm_activity_scores.json', 'act_scores'),
    ('llm_supp_quality_all.json', 'supp_scores'),
    ('llm_major_difficulty.json', 'major_diff'),
    ('ps_yale_scores.json', 'ps_yale'),
]:
    fpath = os.path.join(DATA_DIR, fname)
    if os.path.exists(fpath):
        with open(fpath) as f:
            llm_features_loaded[varname] = json.load(f)
        print(f" Loaded {fname}: {len(llm_features_loaded[varname])} entries")
    else:
        llm_features_loaded[varname] = {}

# Load raw data to get ED2 round info: parse each student's free-text
# "school_results_summary" into a {"<sid>_<school>": round_type} lookup.
import re
RAW_CSV = os.path.join(DATA_DIR, 'students_with_essays_merged_clean.csv')
round_lookup = {}
if os.path.exists(RAW_CSV):
    print(f"\n Loading raw CSV for ED2 round info...")
    try:
        # chunksize keeps memory bounded; only two columns are needed.
        raw_chunks = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'],
                                 dtype=str, chunksize=500)
        for chunk in raw_chunks:
            for _, row in chunk.iterrows():
                sid = str(row.get('student_id', '')).replace('.0', '')
                summary = str(row.get('school_results_summary', ''))
                # Entries look like "1. School - ..."; split on the numbering.
                entries = re.split(r'(?=\d+\.)', summary)
                for entry in entries:
                    m = re.search(r'(Early Decision II|Early Decision|Early Action II|Early Action|Restrictive Early Action|Regular Decision)', entry)
                    if m:
                        round_type = m.group(1)
                        # School name sits between "N." and the next "-"/"(".
                        school_m = re.search(r'\d+\.\s*(.+?)(?:\s*[-–]\s*|\s*\()', entry)
                        if school_m:
                            school_name = school_m.group(1).strip()
                            key = f"{sid}_{school_name}"
                            round_lookup[key] = round_type
        print(f" Round lookup built: {len(round_lookup)} entries")
    except Exception as e:
        # Best-effort: round detail is optional; fall back to coarse round_cat.
        print(f" Warning: Could not load raw CSV: {e}")
122
+
123
# ============================================================
# 2. DATA CLEANING & QUALITY FIXES
# ============================================================
print(f"\n{'='*70}")
print(f" DATA QUALITY FIXES")
print(f"{'='*70}")

# 2a. Filter years: drop 2018-2019 cohorts.
df = df_raw[~df_raw['year'].isin([2018, 2019])].copy()
df = df.reset_index(drop=True)
print(f"After filtering 2018-2019: {df.shape}")

# 2b. FIX #1: SAT=0 is a missing-value placeholder -> NaN, plus an explicit
# has_sat indicator so models can still split on "submitted a score at all".
sat_zero = (df['sat'] == 0).sum()
df['has_sat'] = (df['sat'] > 0).astype(int)
df.loc[df['sat'] == 0, 'sat'] = np.nan
print(f"\n FIX #1: SAT=0 -> NaN: {sat_zero} rows ({sat_zero/len(df)*100:.1f}%)")
print(f" has_sat=1: {df['has_sat'].sum()}, has_sat=0: {(df['has_sat']==0).sum()}")

# 2c. FIX #2: TOEFL=0 -> NaN + has_toefl (same placeholder convention).
toefl_zero = (df['toefl'] == 0).sum()
df['has_toefl'] = (df['toefl'] > 0).astype(int)
df.loc[df['toefl'] == 0, 'toefl'] = np.nan
print(f" FIX #2: TOEFL=0 -> NaN: {toefl_zero} rows ({toefl_zero/len(df)*100:.1f}%)")

# 2d. FIX #3: GPA=0 -> NaN (the v5 matrix already ships a has_gpa flag;
# create it only if absent, e.g. on the v4 fallback).
gpa_zero = (df['gpa'] == 0).sum()
df.loc[df['gpa'] == 0, 'gpa'] = np.nan
print(f" FIX #3: GPA=0 -> NaN: {gpa_zero} rows ({gpa_zero/len(df)*100:.1f}%)")
if 'has_gpa' not in df.columns:
    df['has_gpa'] = df['gpa'].notna().astype(int)
print(f" has_gpa=1: {(df['has_gpa']==1).sum()}, has_gpa=0: {(df['has_gpa']==0).sum()}")

# 2e. FIX #4: -1 sentinel -> NaN for columns that used it.
sentinel_cols = ['taste_yearly_admits_log']
# v5 removed hs_to_univ_hist_rate, hs_to_univ_hist_rate_smoothed,
# hs_overall_hist_rate — but include them if present (v4 fallback).
for col in ['hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_overall_hist_rate']:
    if col in df.columns:
        sentinel_cols.append(col)

for col in sentinel_cols:
    if col in df.columns:
        n_neg1 = (df[col] == -1).sum()
        df.loc[df[col] == -1, col] = np.nan
        print(f" FIX #4: {col}: -1 -> NaN: {n_neg1} rows ({n_neg1/len(df)*100:.1f}%)")

# 2f. FIX #5: students without a personal statement (has_ps=0) must not carry
# BERT embeddings of an empty essay — blank out their ps_bert_pca_* values.
act_bert_cols = [c for c in df.columns if c.startswith('act_bert_pca_')]
# ps_bert_pca columns were removed in v5, but check (v4 fallback).
ps_bert_cols = [c for c in df.columns if c.startswith('ps_bert_pca_')]
if ps_bert_cols:
    no_ps_mask = df['has_ps'] == 0
    n_fix = no_ps_mask.sum()
    for col in ps_bert_cols:
        df.loc[no_ps_mask, col] = np.nan
    print(f" FIX #5: ps_bert -> NaN for has_ps=0: {n_fix} rows, {len(ps_bert_cols)} columns")
else:
    print(f" FIX #5: No ps_bert_pca columns in v5 (already removed)")

# 2g. Portfolio size: heavy right tail, so cap at 20 then log1p; keep the raw
# value for binning and exclude it from the model features later.
print(f"\n Portfolio size transform:")
print(f" Before: mean={df['portfolio_size'].mean():.1f}, max={df['portfolio_size'].max():.0f}")
df['portfolio_size_raw'] = df['portfolio_size'].copy()
df['portfolio_size'] = np.log1p(df['portfolio_size'].clip(upper=20))
print(f" After log(clip(x,20)): mean={df['portfolio_size'].mean():.2f}, max={df['portfolio_size'].max():.2f}")
df['portfolio_size_bin'] = pd.cut(df['portfolio_size_raw'],
                                  bins=[0, 5, 10, 15, 20, 100],
                                  labels=[0, 1, 2, 3, 4]).astype(int)
192
+
193
+ # 2h. ED2 split (from V2)
194
def get_detailed_round(row, lookup=None):
    """Return the detailed application round for one row: ED1/ED2/EA/REA/RD.

    Looks up "<student_id>_<school>" in *lookup* (defaults to the
    module-level ``round_lookup`` built from the raw CSV). Falls back to the
    row's coarse ``round_cat`` when no detailed round info is available,
    mapping the legacy 'ED' label to 'ED1'.

    The *lookup* parameter is a backward-compatible addition: existing
    callers (``df.apply(get_detailed_round, axis=1)``) are unaffected.
    """
    if lookup is None:
        lookup = round_lookup
    sid = str(row.get('student_id', '')).replace('.0', '')
    school = str(row.get('school', ''))
    raw_round = lookup.get(f"{sid}_{school}", '')
    # Substring checks are ordered most-specific first: 'Early Decision II'
    # contains 'Early Decision', and 'Restrictive Early Action' contains
    # 'Early Action'.
    if 'Early Decision II' in raw_round:
        return 'ED2'
    elif 'Early Decision' in raw_round:
        return 'ED1'
    elif 'Restrictive Early Action' in raw_round:
        return 'REA'
    elif 'Early Action II' in raw_round or 'Early Action' in raw_round:
        return 'EA'
    elif 'Regular Decision' in raw_round:
        return 'RD'
    # Fall back to the original coarse round label.
    orig = str(row.get('round_cat', 'RD'))
    if orig == 'ED':
        return 'ED1'
    return orig
214
+
215
# Apply the detailed-round mapping row-wise and derive binary round flags.
df['round_cat_v2'] = df.apply(get_detailed_round, axis=1)
print(f"\n Round distribution (v2):")
print(df['round_cat_v2'].value_counts().to_string())

df['is_ed1'] = (df['round_cat_v2'] == 'ED1').astype(int)
df['is_ed2'] = (df['round_cat_v2'] == 'ED2').astype(int)
df['is_rea'] = (df['round_cat_v2'] == 'REA').astype(int)
# "early" = any non-RD round (ED1/ED2/EA/REA).
df['is_early'] = df['round_cat_v2'].isin(['ED1', 'ED2', 'EA', 'REA']).astype(int)
# Overwrite the coarse round_cat with the detailed version for downstream use.
df['round_cat'] = df['round_cat_v2']
224
+
225
# ============================================================
# 3. PARSE LLM FEATURES
# ============================================================
# Each LLM JSON may be either a list of per-record dicts or an already-keyed
# dict; normalize both shapes into plain lookup dicts.

# Activity scores, keyed by student id. For list input, also index every
# numeric-looking part of a composite "a_b" id so either form resolves.
act_scores = {}
raw = llm_features_loaded.get('act_scores', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict) and item.get('success', False):
            sid_raw = str(item.get('student_id', ''))
            act_scores[sid_raw] = item
            parts = sid_raw.split('_')
            for p in parts:
                clean = p.replace('.0', '')
                if clean.isdigit():
                    act_scores[clean] = item
elif isinstance(raw, dict):
    for sid, scores in raw.items():
        if isinstance(scores, dict):
            act_scores[sid] = scores

# Supplemental-essay quality, keyed by "<sid>_<school>" (school-specific).
supp_scores = {}
raw = llm_features_loaded.get('supp_scores', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict) and item.get('success', False):
            sid = str(item.get('student_id', '')).replace('.0', '')
            school = str(item.get('school', ''))
            key = f"{sid}_{school}"
            supp_scores[key] = item
elif isinstance(raw, dict):
    for key, scores in raw.items():
        if isinstance(scores, dict):
            supp_scores[key] = scores

# Major difficulty: only the dict form is usable; a list means no lookup.
major_diff = llm_features_loaded.get('major_diff', {})
if isinstance(major_diff, list):
    major_diff = {}

# Personal-statement (Yale rubric) scores, keyed by student id.
ps_yale = {}
raw = llm_features_loaded.get('ps_yale', {})
if isinstance(raw, list):
    for item in raw:
        if isinstance(item, dict):
            sid = str(item.get('student_id', '')).replace('.0', '')
            ps_yale[sid] = item
elif isinstance(raw, dict):
    ps_yale = raw

print(f"\nLLM features: Activity={len(act_scores)}, Supp={len(supp_scores)}, MajorDiff={len(major_diff)}, PS={len(ps_yale)}")

# Score dimensions extracted per record.
ACT_DIMS = ['max_power_index', 'avg_power_index', 'n_high_power',
            'n_founder', 'n_president', 'max_scope',
            'has_publication', 'has_patent', 'has_summer_program',
            'summer_program_tier', 'has_olympiad', 'olympiad_level',
            'activity_coherence', 'spike_strength']

SUPP_DIMS = ['overall_quality', 'specificity_score', 'enthusiasm_score',
             'has_imagination_scene', 'mentions_specific_course',
             'mentions_specific_professor', 'mentions_specific_program',
             'mentions_specific_facility', 'coherence_with_major', 'has_red_flag']

# PS dims are discovered from the first record (minus metadata keys); fall
# back to a fixed rubric list if nothing was loaded.
sample_ps = next(iter(ps_yale.values()), {}) if ps_yale else {}
PS_DIMS = [k for k in sample_ps.keys() if k not in ['student_id', 'success', 'error', 'note', 'essay_type']
           and not k.startswith('is_')]
if not PS_DIMS:
    PS_DIMS = ['show_not_tell', 'reflection_depth', 'authentic_voice',
               'coherence_focus', 'overall_effectiveness']
292
+
293
# ============================================================
# 4. DEFINE FEATURE GROUPS (adapted for v5)
# ============================================================
# v5 removed: ps_bert_pca_*, summer detailed features, hs_to_univ_hist_*
# v5 has different act_type_count names (double underscore), so those are
# added dynamically below instead of being listed here.
STUDENT_LEVEL_NUMERIC = [
    'toefl', 'sat', 'gpa',
    'act_total_count', 'act_type_diversity',
    *[f'act_slot_pca_{i}' for i in range(20)],
    *[f'act_bert_pca_{i}' for i in range(16)],
    'honors_max_score', 'honors_avg_score', 'honors_min_score',
    'honors_count', 'honors_total_score',
    'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
    'honors_has_national',
    'cuilu_hs_top10_rate', 'cuilu_hs_top20_rate',
    'cuilu_hs_top10_count', 'cuilu_hs_top20_count',
    'cuilu_hs_total',
    'cuilu_feeder_rank', 'cuilu_hs_type_rate', 'cuilu_region_rate',
]

# Add whichever act_type_count_* columns exist in this feature-matrix version.
act_type_cols_in_data = [c for c in df.columns if c.startswith('act_type_count_')]
STUDENT_LEVEL_NUMERIC.extend(act_type_cols_in_data)

# Keep only columns actually present (handles the v4/v5 schema differences).
STUDENT_LEVEL_NUMERIC = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
print(f"\n Student-level numeric features: {len(STUDENT_LEVEL_NUMERIC)}")

# Core academic/profile features used for school-rate interaction terms.
KEY_STUDENT_FEATURES = [
    'toefl', 'sat', 'gpa',
    'honors_max_score', 'honors_avg_score', 'honors_count',
    'act_type_diversity', 'act_total_count',
]

# LLM-derived aggregates that also get school-rate interaction terms.
LLM_INTERACTION_FEATURES = [
    'llm_act_mean', 'llm_act_max', 'llm_act_avg_power_index',
    'supp_mean', 'supp_max', 'ps_mean',
    'major_difficulty',
]
333
+
334
+ # ============================================================
335
+ # 5. BUILD FEATURES
336
+ # ============================================================
337
def build_features_base(df):
    """Build base (fold-independent) features WITHOUT residualization.

    Maps the module-level LLM lookups (act_scores, supp_scores, major_diff,
    ps_yale) onto the frame, builds NaN-aware aggregates and simple pairwise
    interactions, then label-encodes the categorical columns.

    Returns (df, cat_cols): the augmented copy of *df* and the list of
    label-encoded categorical column names.
    """
    df = df.copy()

    df['is_partial_year'] = (df['year'] == 2025).astype(int)
    df['year_cat'] = df['year'].astype(str)
    # Normalized string student id used as the LLM-lookup key.
    df['sid_str'] = df['student_id'].astype(str).str.replace('.0', '', regex=False)

    # LLM Activity features: one column per ACT_DIMS dimension.
    # (d=dim default-binds the loop variable into each lambda.)
    for dim in ACT_DIMS:
        col_name = f'llm_act_{dim}'
        df[col_name] = df['sid_str'].map(
            lambda s, d=dim: safe_num(act_scores.get(s, {}).get(d, np.nan)))

    # LLM Supp features are school-specific, keyed "<sid>_<school>".
    def get_supp_score(row, dim):
        key = f"{row['sid_str']}_{row['school']}"
        return safe_num(supp_scores.get(key, {}).get(dim, np.nan))
    for dim in SUPP_DIMS:
        col_name = f'supp_{dim}'
        df[col_name] = df.apply(lambda r, d=dim: get_supp_score(r, d), axis=1)

    # Major difficulty keyed by "<school>_<major_cat>".
    def get_major_diff(row):
        key = f"{row['school']}_{row['major_cat']}"
        return safe_num(major_diff.get(key, {}).get('difficulty_score', np.nan))
    df['major_difficulty'] = df.apply(get_major_diff, axis=1)

    # PS Yale rubric scores, keyed by student id.
    for dim in PS_DIMS:
        col_name = f'ps_{dim}'
        df[col_name] = df['sid_str'].map(
            lambda s, d=dim: safe_num(ps_yale.get(s, {}).get(d, np.nan)))

    # Aggregates: pandas mean/max skip NaN, so missing dims don't poison rows.
    llm_act_cols = [f'llm_act_{d}' for d in ACT_DIMS]
    valid_act = df[llm_act_cols]
    df['llm_act_mean'] = valid_act.mean(axis=1)
    df['llm_act_max'] = valid_act.max(axis=1)
    df['llm_act_n_valid'] = valid_act.notna().sum(axis=1)

    # has_red_flag is a binary flag, not a quality score — excluded from mean.
    supp_num_cols = [f'supp_{d}' for d in SUPP_DIMS if d not in ['has_red_flag']]
    valid_supp = df[supp_num_cols]
    df['supp_mean'] = valid_supp.mean(axis=1)
    df['supp_max'] = valid_supp.max(axis=1)

    ps_cols = [f'ps_{d}' for d in PS_DIMS]
    valid_ps = df[ps_cols]
    df['ps_mean'] = valid_ps.mean(axis=1)

    # Basic interactions (NaN-safe: NaN * anything = NaN, which is intended).
    df['toefl_x_sat'] = df['toefl'] * df['sat'] / 10000.0
    df['gpa_x_toefl'] = df['gpa'] * df['toefl'] / 100.0
    df['llm_act_x_supp'] = df['llm_act_mean'] * df['supp_mean']

    if 'honors_avg_score' in df.columns:
        df['honors_x_sat'] = df['honors_avg_score'] * df['sat'] / 1600
        df['honors_x_toefl'] = df['honors_avg_score'] * df['toefl'] / 120

    if 'cuilu_hs_top10_rate' in df.columns and 'taste_score_sensitivity' in df.columns:
        df['cuilu_x_taste'] = df['cuilu_hs_top10_rate'] * df['taste_score_sensitivity']

    # Categoricals — round_cat here is the detailed v2 (EA/ED1/ED2/REA/RD).
    cat_cols = ['school', 'round_cat', 'major_cat', 'hs_cat', 'year_cat', 'hs_name', 'province']
    cat_cols = [c for c in cat_cols if c in df.columns]

    # Crossed categoricals let the trees learn school-specific round/major/
    # high-school-type effects directly.
    if 'round_cat' in df.columns:
        df['school_round'] = df['school'].astype(str) + '_' + df['round_cat'].astype(str)
        cat_cols.append('school_round')
    df['school_major'] = df['school'].astype(str) + '_' + df['major_cat'].astype(str)
    cat_cols.append('school_major')
    if 'hs_cat' in df.columns:
        df['school_hstype'] = df['school'].astype(str) + '_' + df['hs_cat'].astype(str)
        cat_cols.append('school_hstype')

    # Label-encode every categorical (fit on the full frame; NOTE(review):
    # encoding on all rows is fine for tree models but means unseen-category
    # handling relies on the '_MISSING_' bucket only).
    for c in cat_cols:
        df[c] = df[c].fillna('_MISSING_').astype(str)
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c]).astype(int)

    return df, cat_cols
418
+
419
+
420
def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
    """Add residualized + interaction + ED boost features using ONLY training data statistics.

    All school-level statistics (base rates, ED boosts, school means,
    percentile distributions) are computed from rows where *train_mask* is
    True, then broadcast to every row — this is the leakage guard, so keep
    the train_df/df distinction intact when editing.

    KEY FIX vs V2: residualization uses only non-NaN values for school means.

    Returns (df, feature_cols, cat_cols, cat_indices) where cat_indices are
    positions of categorical columns inside feature_cols (CatBoost format).
    """
    df = df.copy()

    # Step 1: Bayesian-smoothed school_base_rate — shrink each school's raw
    # admit rate toward the global rate with pseudo-count SMOOTH_STRENGTH.
    train_df = df[train_mask]
    global_rate = train_df[TARGET].mean()

    school_stats = train_df.groupby('school').agg(
        school_raw_rate=(TARGET, 'mean'),
        school_n_apps=(TARGET, 'count'),
        school_n_admits=(TARGET, 'sum'),
    ).reset_index()

    SMOOTH_STRENGTH = 30
    school_stats['school_base_rate'] = (
        (school_stats['school_raw_rate'] * school_stats['school_n_apps'] + global_rate * SMOOTH_STRENGTH) /
        (school_stats['school_n_apps'] + SMOOTH_STRENGTH)
    )

    # Schools unseen in training fall back to the global rate / zero counts.
    df = df.merge(school_stats[['school', 'school_base_rate', 'school_n_apps', 'school_n_admits']],
                  on='school', how='left')
    df['school_base_rate'] = df['school_base_rate'].fillna(global_rate)
    df['school_n_apps'] = df['school_n_apps'].fillna(0)
    df['school_n_admits'] = df['school_n_admits'].fillna(0)

    # Step 1b: ED boost per school = (ED admit rate) - (RD admit rate),
    # computed only for schools that appear in both rounds in training.
    ed1_mask = train_df['is_ed1'] == 1
    rd_mask = train_df['is_early'] == 0

    ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
    rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()

    ed_boost_map = {}
    for school in ed1_school_rates.index:
        if school in rd_school_rates.index:
            ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
    df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)

    ed2_mask = train_df['is_ed2'] == 1
    ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
    ed2_boost_map = {}
    for school in ed2_school_rates.index:
        if school in rd_school_rates.index:
            ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
    df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)

    # Step 2: Residualize student features against their school mean.
    # KEY FIX: groupby().mean() excludes NaN, so placeholder values no longer
    # drag school means toward zero.
    student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]

    resid_cols = []
    for col in student_feat_available:
        resid_col = f'{col}_resid'
        # Compute school mean using ONLY non-NaN training values.
        school_mean_series = train_df.groupby('school')[col].mean()  # NaN excluded by default
        col_school_mean = df['school'].map(school_mean_series)
        # Residual: student value - school mean (NaN if either is NaN).
        df[resid_col] = df[col] - col_school_mean
        resid_cols.append(resid_col)

    # Step 3: Explicit interactions (student feature x school_base_rate).
    interaction_cols = []
    for col in KEY_STUDENT_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']  # NaN propagates naturally
            interaction_cols.append(int_col)

            # Also interact the residual, if Step 2 produced one. NOTE(review):
            # resid_col exists exactly when col does, so nesting here is
            # equivalent either way — confirm against the original layout.
            resid_col = f'{col}_resid'
            if resid_col in df.columns:
                int_resid_col = f'{col}_resid_x_rate'
                df[int_resid_col] = df[resid_col] * df['school_base_rate']
                interaction_cols.append(int_resid_col)

    # Step 3b: LLM feature x school_base_rate interactions.
    for col in LLM_INTERACTION_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

    # Step 3c: portfolio_size x school_base_rate interaction.
    if 'portfolio_size' in df.columns:
        df['portfolio_x_school_rate'] = df['portfolio_size'] * df['school_base_rate']
        interaction_cols.append('portfolio_x_school_rate')

    # Step 3d: ED flag x school ED-boost interaction.
    if 'is_ed1' in df.columns:
        df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
        interaction_cols.append('ed1_x_ed_boost')
    if 'is_ed2' in df.columns:
        df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
        interaction_cols.append('ed2_x_ed2_boost')

    # Step 3e: missing-score flags x school_base_rate.
    for flag in ['has_sat', 'has_toefl', 'has_gpa']:
        if flag in df.columns:
            int_col = f'{flag}_x_school_rate'
            df[int_col] = df[flag] * df['school_base_rate']
            interaction_cols.append(int_col)

    # Step 4: Student percentile within school, against the training-set
    # distribution of non-NaN values (needs >2 observations per school).
    pctile_cols = []
    for col in ['toefl', 'sat', 'gpa', 'honors_max_score',
                'llm_act_mean', 'supp_mean']:
        if col not in df.columns:
            continue
        pctile_col = f'{col}_school_pctile'
        school_distributions = {}
        for school_id in train_df['school'].unique():
            vals = train_df[train_df['school'] == school_id][col].dropna().values
            if len(vals) > 2:
                school_distributions[school_id] = vals

        # Defaults bind col/sd per loop iteration (late-binding guard).
        def compute_pctile(row, col=col, sd=school_distributions):
            school_id = row['school']
            val = row[col]
            if pd.isna(val) or school_id not in sd:
                return np.nan  # Return NaN instead of 0.5
            dist = sd[school_id]
            return np.mean(dist <= val)

        df[pctile_col] = df.apply(compute_pctile, axis=1)
        pctile_cols.append(pctile_col)

    # Step 5: Composite student-strength score (NaN-safe mean of scaled
    # components). NOTE(review): the `weights` list is collected but unused —
    # the mean is unweighted; confirm that is intentional.
    if all(c in df.columns for c in ['toefl', 'sat', 'honors_max_score']):
        components = []
        weights = []
        for col, w, scale in [('toefl', 0.3, 120), ('sat', 0.3, 1600),
                              ('honors_max_score', 0.2, 10), ('llm_act_mean', 0.2, 10)]:
            if col in df.columns:
                components.append(df[col] / scale)
                weights.append(w)
        if components:
            strength_df = pd.DataFrame(components).T
            df['student_strength'] = strength_df.mean(axis=1)  # NaN-safe mean
            df['strength_vs_school'] = df['student_strength'] - (1 - df['school_base_rate'])

    # Build final feature list: all numeric columns minus identifiers/target.
    num_cols = [c for c in df.columns if df[c].dtype in ['float64', 'int64', 'float32', 'int32']
                and c not in [TARGET, 'student_id', 'year', 'Unnamed: 0']]

    all_feat = list(set(num_cols + cat_cols))
    feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
    for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size_raw']:
        if remove in feature_cols:
            feature_cols.remove(remove)

    # Remove constant columns (no signal, can confuse importance estimates).
    to_drop = [c for c in feature_cols if df[c].nunique() <= 1]
    feature_cols = [c for c in feature_cols if c not in to_drop]

    # Apply stage-1 feature selection if provided; must_keep protects the
    # structural columns the pipeline relies on regardless of importance.
    if selected_features is not None:
        must_keep = set(cat_cols) | {'school_base_rate', 'school_n_apps', 'school_n_admits',
                                     'student_strength', 'strength_vs_school',
                                     'school_ed_boost', 'school_ed2_boost',
                                     'is_ed1', 'is_ed2', 'is_rea', 'is_early',
                                     'ed1_x_ed_boost', 'ed2_x_ed2_boost',
                                     'has_sat', 'has_toefl', 'has_gpa',
                                     'portfolio_size', 'portfolio_size_bin', 'portfolio_x_school_rate'}
        feature_cols = [c for c in feature_cols if c in selected_features or c in must_keep]

    # KEY CHANGE: Do NOT fill NaN here — CatBoost handles NaN natively.
    # LGB/XGB callers fill NaN themselves later. Only sanitize +/-inf.
    for c in feature_cols:
        if df[c].dtype in ['float64', 'float32']:
            df[c] = df[c].replace([np.inf, -np.inf], np.nan)

    # Positional indices of categorical features, as CatBoost expects.
    cat_indices = [feature_cols.index(c) for c in cat_cols if c in feature_cols]

    new_feat_count = len(resid_cols) + len(interaction_cols) + len(pctile_cols) + 5
    print(f" Resid features: {len(resid_cols)} resid + {len(interaction_cols)} interact + {len(pctile_cols)} pctile = {new_feat_count} new, total={len(feature_cols)}")

    return df, feature_cols, cat_cols, cat_indices
600
+
601
+
602
# ============================================================
# 6. BUILD BASE FEATURES
# ============================================================
df_base, cat_cols = build_features_base(df)
print(f"\nBase features built. Shape: {df_base.shape}")

# Quick NaN summary — sanity-check that the placeholder->NaN fixes landed.
print(f"\n NaN summary after fixes:")
for col in ['sat', 'toefl', 'gpa']:
    nan_pct = df_base[col].isna().mean() * 100
    print(f" {col}: {nan_pct:.1f}% NaN")

# Label vector and grouping key (GroupKFold groups by student so the same
# student never appears in both train and validation).
y = df_base[TARGET].values
groups = df_base['student_id'].values
616
+
617
# ============================================================
# 7. STAGE 1: FEATURE IMPORTANCE ESTIMATION
# ============================================================
# A cheap 5-fold CatBoost pass whose only purpose is to rank features; the
# top FEATURE_SELECT_TOP_N non-categorical features feed the real models.
print(f"\n{'='*70}")
print(f" STAGE 1: FEATURE IMPORTANCE ESTIMATION")
print(f"{'='*70}")

stage1_fi = []
gkf_s1 = GroupKFold(n_splits=5)
for fold, (tr_idx, va_idx) in enumerate(gkf_s1.split(df_base, y, groups)):
    # Boolean mask over df_base marking this fold's training rows — the
    # residualization statistics are computed from these rows only.
    train_mask = pd.Series(False, index=df_base.index)
    train_mask.iloc[tr_idx] = True

    df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
        df_base, train_mask, cat_cols)

    X_tr = df_fold[feat_cols_f].iloc[tr_idx]
    X_va = df_fold[feat_cols_f].iloc[va_idx]
    y_tr = y[tr_idx]
    y_va = y[va_idx]

    for c in cat_cols_f:
        if c in X_tr.columns:
            X_tr[c] = X_tr[c].astype(int)
            X_va[c] = X_va[c].astype(int)
    # CatBoost handles NaN natively — don't fill.

    cb = CatBoostClassifier(
        iterations=500, depth=6, learning_rate=0.05,
        l2_leaf_reg=7, random_seed=42, verbose=0,
        cat_features=cat_idx_f, eval_metric='AUC',
        early_stopping_rounds=50)
    pool_tr = Pool(X_tr, y_tr, cat_features=cat_idx_f)
    pool_va = Pool(X_va, y_va, cat_features=cat_idx_f)
    cb.fit(pool_tr, eval_set=pool_va, verbose=0)

    fi = cb.get_feature_importance()
    stage1_fi.append(fi)

    auc = roc_auc_score(y_va, cb.predict_proba(Pool(X_va, cat_features=cat_idx_f))[:, 1])
    print(f" Fold {fold+1}/5: AUC={auc:.4f}, Features={len(feat_cols_f)}")

    # Feature names/order are identical across folds; record them once.
    if fold == 0:
        all_feature_names = feat_cols_f

    # Free fold-local objects to keep peak memory down across folds.
    del cb, pool_tr, pool_va, df_fold; gc.collect()

# Select top features by importance averaged over folds. Categorical columns
# are always kept; only numeric features compete for the TOP_N slots.
avg_fi = np.mean(stage1_fi, axis=0)
fi_pairs = sorted(zip(all_feature_names, avg_fi), key=lambda x: -x[1])

selected_set = set(cat_cols)
n_added = 0
for fname, imp in fi_pairs:
    if fname not in cat_cols:
        selected_set.add(fname)
        n_added += 1
        if n_added >= FEATURE_SELECT_TOP_N:
            break

print(f"\n Feature selection: {len(all_feature_names)} -> {len(selected_set)} features")
print(f" Top 20 features:")
# Markers tag feature provenance: [R]esidual, [I]nteraction, [P]ercentile,
# [S]chool base rate, [ED] boost.
for i, (fname, imp) in enumerate(fi_pairs[:20]):
    marker = ""
    if '_resid' in fname: marker = " [R]"
    elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [I]"
    elif '_school_pctile' in fname: marker = " [P]"
    elif 'school_base_rate' in fname: marker = " [S]"
    elif 'ed_boost' in fname: marker = " [ED]"
    print(f" {i+1:3d}. {fname:<50s} {imp:>8.2f}{marker}")
687
+
688
# ============================================================
# 8. TEMPORAL VALIDATION WITH SELECTED FEATURES
# ============================================================
# Train on 2020-2023 and score on 2024 to estimate forward-in-time
# generalization of the Stage-1 feature selection.
print(f"\n{'='*70}")
print(f" TEMPORAL VALIDATION (2020-2023 -> 2024) WITH FEATURE SELECTION")
print(f"{'='*70}")

mask_train_temporal = df_base['year'].isin([2020, 2021, 2022, 2023])
mask_test_temporal = df_base['year'] == 2024

temporal_results = {}  # per-seed AUCs: {'cb':..., 'lgb':..., 'xgb':..., 'blend':...}
if mask_test_temporal.sum() > 0:
    # Residualized features are fit on the temporal-train years only,
    # restricted to the Stage-1 selected feature set (leak-free w.r.t. 2024).
    df_temporal, feat_cols_t, cat_cols_t, cat_idx_t = add_residualized_features(
        df_base, mask_train_temporal, cat_cols, selected_features=selected_set)

    X_t = df_temporal[feat_cols_t].copy()
    for c in cat_cols_t:
        if c in X_t.columns:
            X_t[c] = X_t[c].astype(int)  # categorical columns must be int for CatBoost/LGB

    X_tr_t = X_t[mask_train_temporal]
    X_te_t = X_t[mask_test_temporal]
    y_tr_t = y[mask_train_temporal]
    y_te_t = y[mask_test_temporal]

    # For LGB/XGB: fill NaN with -999 (a value they can split on);
    # CatBoost below consumes the raw frame and handles NaN natively.
    X_tr_t_filled = X_tr_t.fillna(-999)
    X_te_t_filled = X_te_t.fillna(-999)

    print(f" Train: {len(X_tr_t)}, Test: {len(X_te_t)}, Features: {len(feat_cols_t)}")

    for seed in SEEDS:
        # CatBoost: native NaN handling.
        # NOTE(review): the 2024 *test* pool is used as eval_set for early
        # stopping here (and for LGB/XGB below), so the reported temporal
        # AUC is mildly optimistic — confirm this matches the protocol used
        # for the V37.3 / V38.2-PRO-V2 baseline numbers.
        cb_t = CatBoostClassifier(
            iterations=1000, depth=6, learning_rate=0.03,
            l2_leaf_reg=7, random_seed=seed, verbose=0,
            cat_features=cat_idx_t, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=10)
        pool_tr = Pool(X_tr_t, y_tr_t, cat_features=cat_idx_t)
        pool_te = Pool(X_te_t, y_te_t, cat_features=cat_idx_t)
        cb_t.fit(pool_tr, eval_set=pool_te, verbose=0)
        cb_pred = cb_t.predict_proba(Pool(X_te_t, cat_features=cat_idx_t))[:, 1]
        del cb_t; gc.collect()

        # LGB: use the -999-filled matrices.
        lgb_tr = lgb.Dataset(X_tr_t_filled.values, y_tr_t, categorical_feature=cat_idx_t)
        lgb_va = lgb.Dataset(X_te_t_filled.values, y_te_t, categorical_feature=cat_idx_t, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
            'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
                              valid_sets=[lgb_va],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_te_t_filled.values)
        del lgb_model; gc.collect()

        # XGB: use the -999-filled matrices.
        dtrain = xgb.DMatrix(X_tr_t_filled.values, label=y_tr_t, enable_categorical=False)
        dtest = xgb.DMatrix(X_te_t_filled.values, label=y_te_t, enable_categorical=False)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 6, 'learning_rate': 0.03,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'min_child_weight': 5,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                              evals=[(dtest, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dtest)
        del xgb_model, dtrain, dtest; gc.collect()

        # Fixed prior blend weights; the OOF-optimal weights are searched
        # later, in the ensemble section.
        blend = 0.45 * cb_pred + 0.20 * lgb_pred + 0.35 * xgb_pred
        temporal_results[seed] = {
            'cb': float(roc_auc_score(y_te_t, cb_pred)),
            'lgb': float(roc_auc_score(y_te_t, lgb_pred)),
            'xgb': float(roc_auc_score(y_te_t, xgb_pred)),
            'blend': float(roc_auc_score(y_te_t, blend))
        }
        print(f" Seed {seed}: CB={temporal_results[seed]['cb']:.4f} LGB={temporal_results[seed]['lgb']:.4f} XGB={temporal_results[seed]['xgb']:.4f} Blend={temporal_results[seed]['blend']:.4f}")

    # Hard-coded comparison numbers are the previous versions' benchmarks.
    avg_temporal = np.mean([v['blend'] for v in temporal_results.values()])
    print(f"\n AVG Temporal Blend: {avg_temporal:.4f} (V37.3: 0.8410, V38.2-PRO-V2: 0.8469)")
    print(f" Delta vs V37.3: {avg_temporal - 0.8410:+.4f}")
    print(f" Delta vs V38.2-PRO-V2: {avg_temporal - 0.8469:+.4f}")

    del df_temporal, X_t; gc.collect()
else:
    # No 2024 rows present: skip temporal validation entirely.
    avg_temporal = 0.0
# ============================================================
# 9. STAGE 2: MULTI-SEED GROUPKFOLD
# ============================================================
# Main CV: for each seed, run a GroupKFold (grouped by `groups` — presumably
# school, TODO confirm where `groups` is built) and collect out-of-fold
# predictions for CatBoost / LightGBM / XGBoost.
print(f"\n{'='*70}")
print(f" STAGE 2: MULTI-SEED GROUPKFOLD ({len(SEEDS)} seeds x {N_FOLDS} folds)")
print(f"{'='*70}")

all_cb_oof = []
all_lgb_oof = []
all_xgb_oof = []
all_fi = []                # one CatBoost importance vector per seed (last fold only)
feature_cols_final = None  # post-selection feature list, captured on first fold

for seed_idx, seed in enumerate(SEEDS):
    print(f"\n --- Seed {seed} ({seed_idx+1}/{len(SEEDS)}) ---")
    gkf = GroupKFold(n_splits=N_FOLDS)
    cb_oof = np.zeros(len(df_base))
    lgb_oof = np.zeros(len(df_base))
    xgb_oof = np.zeros(len(df_base))

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(df_base, y, groups)):
        # Positional boolean mask over df_base: residual encoders are fit
        # on this fold's training rows only (leak-free target encoding).
        train_mask = pd.Series(False, index=df_base.index)
        train_mask.iloc[tr_idx] = True

        df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
            df_base, train_mask, cat_cols, selected_features=selected_set)

        if feature_cols_final is None:
            feature_cols_final = feat_cols_f
            print(f" Total features after selection: {len(feat_cols_f)}")

        X_fold = df_fold[feat_cols_f].copy()
        for c in cat_cols_f:
            if c in X_fold.columns:
                X_fold[c] = X_fold[c].astype(int)  # categorical columns must be int-typed

        X_tr_df = X_fold.iloc[tr_idx]
        X_va_df = X_fold.iloc[va_idx]
        y_tr = y[tr_idx]
        y_va = y[va_idx]

        # CatBoost: native NaN handling; early stopping on the fold's
        # own validation pool.
        cb = CatBoostClassifier(
            iterations=1500, depth=6, learning_rate=0.03,
            l2_leaf_reg=7, random_seed=seed, verbose=0,
            cat_features=cat_idx_f, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=10)
        pool_tr = Pool(X_tr_df, y_tr, cat_features=cat_idx_f)
        pool_va = Pool(X_va_df, y_va, cat_features=cat_idx_f)
        cb.fit(pool_tr, eval_set=pool_va, verbose=0)
        cb_pred = cb.predict_proba(Pool(X_va_df, cat_features=cat_idx_f))[:, 1]
        cb_oof[va_idx] = cb_pred

        if fold == N_FOLDS - 1:
            # One importance snapshot per seed, taken from the last fold.
            all_fi.append(cb.get_feature_importance())
        del cb, pool_tr, pool_va; gc.collect()

        # LGB/XGB: no native-NaN path here — fill with a sentinel value
        # they can split on.
        X_tr_filled = X_tr_df.fillna(-999).values
        X_va_filled = X_va_df.fillna(-999).values

        lgb_tr = lgb.Dataset(X_tr_filled, y_tr, categorical_feature=cat_idx_f)
        lgb_va_ds = lgb.Dataset(X_va_filled, y_va, categorical_feature=cat_idx_f, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
            'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
                              valid_sets=[lgb_va_ds],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_va_filled)
        lgb_oof[va_idx] = lgb_pred
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_filled, label=y_tr)
        dval = xgb.DMatrix(X_va_filled, label=y_va)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 6, 'learning_rate': 0.03,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'min_child_weight': 5,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                              evals=[(dval, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dval)
        xgb_oof[va_idx] = xgb_pred
        del xgb_model, dtrain, dval, df_fold, X_fold; gc.collect()

        if (fold + 1) % 5 == 0:
            print(f" Fold {fold+1}/{N_FOLDS} done")

    # Per-seed OOF AUCs: every row is predicted exactly once per seed.
    cb_auc = roc_auc_score(y, cb_oof)
    lgb_auc = roc_auc_score(y, lgb_oof)
    xgb_auc = roc_auc_score(y, xgb_oof)
    print(f" CB: {cb_auc:.4f} LGB: {lgb_auc:.4f} XGB: {xgb_auc:.4f}")

    all_cb_oof.append(cb_oof)
    all_lgb_oof.append(lgb_oof)
    all_xgb_oof.append(xgb_oof)
# ============================================================
# 10. ENSEMBLE & BLEND
# ============================================================
print(f"\n{'='*70}")
print(f" ENSEMBLE RESULTS")
print(f"{'='*70}")

# Average each model's OOF predictions across the seeds.
cb_avg = np.mean(all_cb_oof, axis=0)
lgb_avg = np.mean(all_lgb_oof, axis=0)
xgb_avg = np.mean(all_xgb_oof, axis=0)

cb_final_auc = roc_auc_score(y, cb_avg)
lgb_final_auc = roc_auc_score(y, lgb_avg)
xgb_final_auc = roc_auc_score(y, xgb_avg)

print(f" CB {len(SEEDS)}-seed avg: {cb_final_auc:.4f}")
print(f" LGB {len(SEEDS)}-seed avg: {lgb_final_auc:.4f}")
print(f" XGB {len(SEEDS)}-seed avg: {xgb_final_auc:.4f}")

def _blend_auc(wc, wl, wx):
    """OOF AUC of the convex blend wc*CB + wl*LGB + wx*XGB."""
    return roc_auc_score(y, wc * cb_avg + wl * lgb_avg + wx * xgb_avg)

# Coarse grid search over convex weights; XGB gets whatever remains and
# every model is kept at weight >= 0.05.
best_auc = 0
best_weights = (0.45, 0.20, 0.35)
for wc in np.arange(0.2, 0.7, 0.05):
    for wl in np.arange(0.05, 0.5, 0.05):
        wx = 1.0 - wc - wl
        if wx < 0.05:
            continue
        cand = _blend_auc(wc, wl, wx)
        if cand > best_auc:
            best_auc, best_weights = cand, (wc, wl, wx)

print(f"\n Best 3-model blend: {best_auc:.4f}")
print(f" Delta vs V37.3: {best_auc - 0.8697:+.4f}")
print(f" Delta vs V38.2-PRO-V2: {best_auc - 0.8722:+.4f}")
print(f" Weights: CB={best_weights[0]:.2f} LGB={best_weights[1]:.2f} XGB={best_weights[2]:.2f}")

# Rank-average blend as a scale-free sanity check.
rank_blend = (rankdata(cb_avg) + rankdata(lgb_avg) + rankdata(xgb_avg)) / 3
rank_auc = roc_auc_score(y, rank_blend)
print(f" Rank blend: {rank_auc:.4f}")

# Final probability blend with the grid-searched weights; probabilities are
# clipped away from 0/1 before the log-based metrics.
w_cb_best, w_lgb_best, w_xgb_best = best_weights
final_blend_prob = w_cb_best * cb_avg + w_lgb_best * lgb_avg + w_xgb_best * xgb_avg
final_auc = roc_auc_score(y, final_blend_prob)
clipped_prob = np.clip(final_blend_prob, 1e-7, 1 - 1e-7)
final_brier = brier_score_loss(y, clipped_prob)
final_logloss = log_loss(y, clipped_prob)

print(f"\n FINAL METRICS:")
print(f" AUC: {final_auc:.4f} (V37.3: 0.8697, V38.2-PRO-V2: 0.8722)")
print(f" Brier: {final_brier:.4f}")
print(f" LogLoss: {final_logloss:.4f}")
# ============================================================
# 11. FEATURE IMPORTANCE
# ============================================================
print(f"\n{'='*70}")
print(f" FEATURE IMPORTANCE (avg across seeds)")
print(f"{'='*70}")

if feature_cols_final and all_fi:
    # Mean CatBoost importance over the per-seed snapshots, sorted
    # descending. `fi_pairs` is also consumed by the results JSON below.
    avg_fi = np.mean(all_fi, axis=0)
    fi_pairs = sorted(zip(feature_cols_final, avg_fi), key=lambda x: -x[1])

    # Ordered family tags; the first matching rule wins (equivalent to an
    # if/elif chain over the feature name).
    tag_rules = (
        (lambda f: '_resid' in f, " [RESID]"),
        (lambda f: '_x_school_rate' in f or '_resid_x_rate' in f or '_x_ed' in f, " [INTERACT]"),
        (lambda f: '_school_pctile' in f, " [PCTILE]"),
        (lambda f: f.startswith('school_base_rate'), " [SCHOOL_RATE]"),
        (lambda f: 'ed_boost' in f or 'ed2_boost' in f, " [ED_BOOST]"),
        (lambda f: f.startswith('has_'), " [FLAG]"),
    )

    print(f" {'Rank':<5s} {'Feature':<50s} {'Importance':>10s}")
    print(f" {'-'*5} {'-'*50} {'-'*10}")
    for idx, (fname, imp) in enumerate(fi_pairs[:50]):
        marker = next((tag for hit, tag in tag_rules if hit(fname)), "")
        print(f" {idx+1:<5d} {fname:<50s} {imp:>10.2f}{marker}")

    # How many engineered (residual/interaction/percentile/school-rate)
    # features made it into the top 30.
    engineered = ('_resid', '_x_school_rate', '_school_pctile', 'school_base_rate')
    resid_in_top30 = sum(1 for f, _ in fi_pairs[:30]
                         if any(tok in f for tok in engineered))
    print(f"\n Residualized/interaction features in top 30: {resid_in_top30}")

# ============================================================
# 12. SAVE RESULTS
# ============================================================
elapsed = time.time() - start_time

# Run summary written next to the OOF predictions. The hard-coded numbers
# under 'comparison' are the previous versions' benchmark scores; the
# 'changes' list documents what differs from V38.2-PRO-V2.
results = {
    'version': 'V38.2-pro-v3',
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'elapsed_minutes': elapsed / 60,
    'changes': [
        'Use v5 feature matrix (GPA fixed: 40.3% coverage)',
        'SAT=0 -> NaN + has_sat flag',
        'TOEFL=0 -> NaN + has_toefl flag',
        'GPA=0 -> NaN + has_gpa flag',
        '-1 -> NaN for sentinel columns',
        'Residualization uses only non-NaN values',
        'CatBoost native NaN handling',
        'LGB/XGB use -999 for NaN',
        'Percentile returns NaN instead of 0.5 for missing',
    ],
    'comparison': {
        'v37_3': {'auc': 0.8697, 'temporal_auc': 0.8410},
        'v38_2_pro_v2': {'auc': 0.8722, 'temporal_auc': 0.8469},
    },
    'temporal_validation': {
        'per_seed': temporal_results,
        'avg_blend': float(avg_temporal),
    },
    'groupkfold': {
        'best_3model_blend': float(best_auc),
        'best_weights': [float(w) for w in best_weights],
        'rank_blend': float(rank_auc),
    },
    'final_metrics': {
        'auc': float(final_auc),
        'brier': float(final_brier),
        'logloss': float(final_logloss),
    },
    'n_features': len(feature_cols_final) if feature_cols_final else 0,
    # Relies on fi_pairs as (re)computed in the feature-importance section;
    # the guard mirrors the one used there.
    'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
}

with open(os.path.join(OUTPUT_DIR, 'v38_2_pro_v3_results.json'), 'w') as f:
    json.dump(results, f, indent=2)

# Per-row OOF predictions for downstream analysis/stacking.
oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
oof_df['cb_pred'] = cb_avg
oof_df['lgb_pred'] = lgb_avg
oof_df['xgb_pred'] = xgb_avg
oof_df['final_pred'] = final_blend_prob
oof_df.to_csv(os.path.join(OUTPUT_DIR, 'v38_2_pro_v3_oof_predictions.csv'), index=False)

print(f"\n{'='*70}")
print(f" V38.2-PRO-V3 COMPLETE")
print(f" Total time: {elapsed/60:.1f} minutes")
print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'}")
print(f" GroupKFold AUC: {final_auc:.4f} (V37.3: 0.8697, V38.2-PRO-V2: 0.8722)")
print(f" Temporal AUC: {avg_temporal:.4f} (V37.3: 0.8410, V38.2-PRO-V2: 0.8469)")
print(f"{'='*70}")