catninja123 committed on
Commit
eec377f
·
verified ·
1 Parent(s): dc01663

Upload train_v38_2_pro_v10.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_v38_2_pro_v10.py +1375 -0
train_v38_2_pro_v10.py ADDED
@@ -0,0 +1,1375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ====================================================================
3
+ V38.2-PRO-V10 MODEL - Expanded Supp + PS V5 + Activity Labels
4
+ ====================================================================
5
+ Carries forward all V9 features, PLUS:
6
+ NEW #20: Expanded Supp features (42 cols: row-level, student-avg, student-max, binary)
7
+ NEW #21: All features pre-computed in V10 CSV (no JSON loading for supp)
8
+
9
+ ABLATION EXPERIMENT (controlled by EXPERIMENT_MODE):
10
+ "A" = Full model with all features
11
+ "B" = Full model - act_bert_pca (replace with labels)
12
+ "C" = Baseline (no new labels, control)
13
+ ====================================================================
14
+ """
15
+ import pandas as pd
16
+ import numpy as np
17
+ import json, os, warnings, sys, time, pickle, gc
18
+ warnings.filterwarnings('ignore')
19
+ from sklearn.model_selection import GroupKFold
20
+ from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss
21
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
22
+ from sklearn.decomposition import PCA
23
+ from scipy.stats import rankdata
24
+
25
+ try:
26
+ from catboost import CatBoostClassifier, Pool
27
+ import lightgbm as lgb
28
+ import xgboost as xgb
29
+ print("All model libraries loaded successfully")
30
+ except ImportError as e:
31
+ print(f"Missing library: {e}")
32
+ import subprocess
33
+ subprocess.check_call([sys.executable, '-m', 'pip', 'install',
34
+ 'catboost', 'lightgbm', 'xgboost', '-q'])
35
+ from catboost import CatBoostClassifier, Pool
36
+ import lightgbm as lgb
37
+ import xgboost as xgb
38
+
39
# --- Paths (resolved relative to this script's location) ---
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data')      # input feature CSVs / LLM JSONs
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')  # run artifacts are written here
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Run configuration ---
TARGET = 'target'                    # label column name used throughout the script
SEEDS = [42, 123, 456, 789, 2024]    # presumably for multi-seed averaging -- used later in the file
N_FOLDS = 10                         # presumably the GroupKFold fold count -- CV loop is later in the file
FEATURE_SELECT_TOP_N = 150           # cap on features kept by the selection step
start_time = time.time()             # wall-clock start, for runtime reporting
49
+
50
+ # ============================================================
51
+ # EXPERIMENT MODE - controls ablation variant
52
+ # ============================================================
53
+ # "A" = V8 + new labels (keep act_bert_pca)
54
+ # "B" = V8 + new labels - act_bert_pca (replace BERT with labels)
55
+ # "C" = V8 baseline (no new labels, keep BERT) = control
56
+ EXPERIMENT_MODE = os.environ.get('V9_MODE', 'A')
57
+ ABLATE_PS_BERT = False # Keep PS BERT for now (separate concern)
58
+
59
+ # Activity LLM label columns
60
+ ACT_LABEL_COLS = [
61
+ 'activity_uniqueness', 'impact_quantifiability', 'academic_depth',
62
+ 'social_impact_depth', 'institutional_prestige', 'activity_diversity',
63
+ 'entrepreneurial_initiative', 'cross_activity_synergy',
64
+ 'intellectual_generosity', 'writing_craft', 'personal_voice',
65
+ 'info_architecture', 'tone_calibration'
66
+ ]
67
+ N_ACT_LABEL_PCA = 5 # Reduce 13 labels to 5 PCA components
68
+
69
+ # Supp V2 expanded columns (pre-computed in V10 CSV)
70
+ SUPP_ROW_COLS = [
71
+ 'supp_school_specific_program_references', 'supp_school_specific_faculty_mentions',
72
+ 'supp_school_specific_campus_features', 'supp_prompt_specific_alignment',
73
+ 'supp_personal_connection_to_school', 'supp_intellectual_engagement_depth',
74
+ 'supp_extracurricular_alignment', 'supp_values_alignment_with_school',
75
+ 'supp_specific_future_contribution', 'supp_unique_personal_context', 'supp_composite',
76
+ ]
77
+ SUPP_STUDENT_AVG_COLS = [
78
+ 'supp_avg_school_specific_program_references', 'supp_avg_school_specific_faculty_mentions',
79
+ 'supp_avg_school_specific_campus_features', 'supp_avg_prompt_specific_alignment',
80
+ 'supp_avg_personal_connection_to_school', 'supp_avg_intellectual_engagement_depth',
81
+ 'supp_avg_extracurricular_alignment', 'supp_avg_values_alignment_with_school',
82
+ 'supp_avg_specific_future_contribution', 'supp_avg_unique_personal_context',
83
+ ]
84
+ SUPP_STUDENT_MAX_COLS = [
85
+ 'supp_max_school_specific_program_references', 'supp_max_school_specific_faculty_mentions',
86
+ 'supp_max_school_specific_campus_features', 'supp_max_prompt_specific_alignment',
87
+ 'supp_max_personal_connection_to_school', 'supp_max_intellectual_engagement_depth',
88
+ 'supp_max_extracurricular_alignment', 'supp_max_values_alignment_with_school',
89
+ 'supp_max_specific_future_contribution', 'supp_max_unique_personal_context',
90
+ ]
91
+ SUPP_STUDENT_AGG_COLS = ['supp_student_avg_composite', 'supp_student_max_composite', 'supp_student_std_composite', 'supp_n_scored']
92
+ SUPP_BINARY_COLS = ['supp_has_campus_feature', 'supp_has_faculty_mention', 'supp_has_future_contribution',
93
+ 'supp_has_personal_connection', 'supp_has_program_ref', 'supp_has_strong_supp', 'supp_high_specificity']
94
+ SUPP_ALL_COLS = SUPP_ROW_COLS + SUPP_STUDENT_AVG_COLS + SUPP_STUDENT_MAX_COLS + SUPP_STUDENT_AGG_COLS + SUPP_BINARY_COLS
95
+
96
def safe_num(v, default=np.nan):
    """Coerce *v* to a float, mapping the -1 "missing" sentinel to NaN.

    The LLM-scored JSON fields in this pipeline use -1 to mean "not
    available"; this helper normalizes ints, floats and numeric strings
    to float and converts that sentinel to NaN so downstream pandas
    aggregations skip it.

    Parameters
    ----------
    v : Any
        Raw value: int, float, numeric str, or anything else.
    default : float, optional
        Returned when *v* is neither numeric nor a numeric string
        (default: NaN).

    Returns
    -------
    float
        float(v); NaN when the value equals the -1 sentinel; *default*
        for non-numeric input.
    """
    if isinstance(v, (int, float)):
        val = float(v)
        return np.nan if val == -1 else val
    if isinstance(v, str):
        # Only float() is guarded; a bare `except:` here previously
        # swallowed even SystemExit/KeyboardInterrupt.
        try:
            val = float(v)
        except ValueError:
            # Non-numeric string (e.g. "N/A") -> caller-supplied default.
            return default
        return np.nan if val == -1 else val
    return default
107
+
108
+ # ============================================================
109
+ # 1. LOAD DATA
110
+ # ============================================================
111
+ print("=" * 70)
112
+ print(f" V38.2-PRO-V9: ACTIVITY LLM LABELS + ABLATION")
113
+ print(f" EXPERIMENT MODE = {EXPERIMENT_MODE}")
114
+ print("=" * 70)
115
+ mode_desc = {
116
+ 'A': 'V8 + 13 new labels (keep BERT)',
117
+ 'B': 'V8 + 13 new labels - act_bert_pca (replace)',
118
+ 'C': 'V8 baseline (no new labels, control)',
119
+ }
120
+ print(f" Mode description: {mode_desc.get(EXPERIMENT_MODE, 'UNKNOWN')}")
121
+
122
+ # Load main feature matrix (V10 includes PS V5 + expanded Supp + act labels)
123
+ v10_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v10.csv')
124
+ v9_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v9.csv')
125
+ v8_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v8.csv')
126
+ if os.path.exists(v10_path):
127
+ df_raw = pd.read_csv(v10_path)
128
+ print(f"V10 features loaded (PS V5 + expanded Supp): {df_raw.shape}")
129
+ elif os.path.exists(v9_path):
130
+ df_raw = pd.read_csv(v9_path)
131
+ print(f"V9 features loaded: {df_raw.shape}")
132
+ elif os.path.exists(v8_path):
133
+ df_raw = pd.read_csv(v8_path)
134
+ print(f"V8 features loaded: {df_raw.shape}")
135
+ else:
136
+ raise FileNotFoundError("No feature matrix found!")
137
+
138
+ # Load activity LLM labels
139
+ act_labels_path = os.path.join(DATA_DIR, 'act_labels_v2_results.csv')
140
+ act_labels_df = None
141
+ if EXPERIMENT_MODE in ['A', 'B'] and os.path.exists(act_labels_path):
142
+ act_labels_df = pd.read_csv(act_labels_path)
143
+ print(f"Activity LLM labels loaded: {act_labels_df.shape}")
144
+ print(f" Labels: {[c for c in act_labels_df.columns if c != 'student_id']}")
145
+ elif EXPERIMENT_MODE in ['A', 'B']:
146
+ print(f"WARNING: act_labels_v2_results.csv not found! Falling back to mode C")
147
+ EXPERIMENT_MODE = 'C'
148
+
149
+ # Load LLM features
150
+ llm_features_loaded = {}
151
+ for fname, varname in [
152
+ ('llm_activity_scores.json', 'act_scores'),
153
+ ('llm_supp_quality_all.json', 'supp_scores'),
154
+ ('llm_major_difficulty.json', 'major_diff'),
155
+ ('ps_yale_scores.json', 'ps_yale'),
156
+ ]:
157
+ fpath = os.path.join(DATA_DIR, fname)
158
+ if os.path.exists(fpath):
159
+ with open(fpath) as f:
160
+ llm_features_loaded[varname] = json.load(f)
161
+ print(f" Loaded {fname}: {len(llm_features_loaded[varname])} entries")
162
+ else:
163
+ llm_features_loaded[varname] = {}
164
+
165
+ # Load raw data for ED2 round info
166
+ import re
167
+ RAW_CSV = os.path.join(DATA_DIR, 'students_with_essays_merged_clean.csv')
168
+ round_lookup = {}
169
+ if os.path.exists(RAW_CSV):
170
+ print(f"\n Loading raw CSV for ED2 round info...")
171
+ try:
172
+ raw_chunks = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'],
173
+ dtype=str, chunksize=500)
174
+ for chunk in raw_chunks:
175
+ for _, row in chunk.iterrows():
176
+ sid = str(row.get('student_id', '')).replace('.0', '')
177
+ summary = str(row.get('school_results_summary', ''))
178
+ entries = re.split(r'(?=\d+\.)', summary)
179
+ for entry in entries:
180
+ m = re.search(r'(Early Decision II|Early Decision|Early Action II|Early Action|Restrictive Early Action|Regular Decision)', entry)
181
+ if m:
182
+ round_type = m.group(1)
183
+ school_m = re.search(r'\d+\.\s*(.+?)(?:\s*[-–]\s*|\s*\()', entry)
184
+ if school_m:
185
+ school_name = school_m.group(1).strip()
186
+ key = f"{sid}_{school_name}"
187
+ round_lookup[key] = round_type
188
+ print(f" Round lookup built: {len(round_lookup)} entries")
189
+ except Exception as e:
190
+ print(f" Warning: Could not load raw CSV: {e}")
191
+
192
+ # ============================================================
193
+ # 2. MERGE ACTIVITY LLM LABELS INTO MAIN DATAFRAME
194
+ # ============================================================
195
+ if act_labels_df is not None and EXPERIMENT_MODE in ['A', 'B']:
196
+ # Merge on student_id
197
+ n_before = len(df_raw)
198
+ df_raw = df_raw.merge(act_labels_df, on='student_id', how='left')
199
+ assert len(df_raw) == n_before, f"Merge changed row count! {n_before} -> {len(df_raw)}"
200
+
201
+ n_with_labels = df_raw[ACT_LABEL_COLS[0]].notna().sum()
202
+ n_without = df_raw[ACT_LABEL_COLS[0]].isna().sum()
203
+ print(f"\n Activity labels merged: {n_with_labels} rows with labels, {n_without} without ({n_without/len(df_raw)*100:.1f}% NaN)")
204
+
205
+ # ============================================================
206
+ # 3. DATA CLEANING & QUALITY FIXES (same as V8)
207
+ # ============================================================
208
+ print(f"\n{'='*70}")
209
+ print(f" DATA QUALITY FIXES (V8 inherited)")
210
+ print(f"{'='*70}")
211
+
212
+ # Filter years
213
+ df = df_raw[~df_raw['year'].isin([2018, 2019])].copy().reset_index(drop=True)
214
+ print(f"After filtering 2018-2019: {df.shape}")
215
+
216
+ # FIX #1: SAT=0 -> NaN
217
+ sat_zero = (df['sat'] == 0).sum()
218
+ df['has_sat'] = (df['sat'] > 0).astype(int)
219
+ df.loc[df['sat'] == 0, 'sat'] = np.nan
220
+ print(f" FIX #1: SAT=0 -> NaN: {sat_zero} rows")
221
+
222
+ # FIX #2: TOEFL=0 -> NaN
223
+ toefl_zero = (df['toefl'] == 0).sum()
224
+ df['has_toefl'] = (df['toefl'] > 0).astype(int)
225
+ df.loc[df['toefl'] == 0, 'toefl'] = np.nan
226
+ print(f" FIX #2: TOEFL=0 -> NaN: {toefl_zero} rows")
227
+
228
+ # FIX #3: GPA=0 -> NaN
229
+ gpa_zero = (df['gpa'] == 0).sum()
230
+ df.loc[df['gpa'] == 0, 'gpa'] = np.nan
231
+ if 'has_gpa' not in df.columns:
232
+ df['has_gpa'] = df['gpa'].notna().astype(int)
233
+ print(f" FIX #3: GPA=0 -> NaN: {gpa_zero} rows")
234
+
235
+ # FIX #4: -1 -> NaN
236
+ sentinel_cols = ['taste_yearly_admits_log']
237
+ for col in ['hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_overall_hist_rate']:
238
+ if col in df.columns:
239
+ sentinel_cols.append(col)
240
+ for col in sentinel_cols:
241
+ if col in df.columns:
242
+ n_neg1 = (df[col] == -1).sum()
243
+ df.loc[df[col] == -1, col] = np.nan
244
+ if n_neg1 > 0:
245
+ print(f" FIX #4: {col}: -1 -> NaN: {n_neg1} rows")
246
+
247
+ # FIX #5: ps_bert -> NaN for has_ps=0
248
+ ps_bert_cols = [c for c in df.columns if c.startswith('ps_bert_pca_')]
249
+ no_ps_mask = df['has_ps'] == 0
250
+ for col in ps_bert_cols:
251
+ df.loc[no_ps_mask, col] = np.nan
252
+ print(f" FIX #5: ps_bert -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
253
+
254
+ # FIX #6: ps2_* -> NaN for has_ps=0
255
+ ps2_score_cols = [c for c in df.columns if c.startswith('ps2_') and c != 'ps2_essay_type']
256
+ for col in ps2_score_cols:
257
+ df.loc[no_ps_mask, col] = np.nan
258
+ print(f" FIX #6: ps2_* -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
259
+
260
+ # FIX #6b: Remove ps2_is_cliche_topic
261
+ if 'ps2_is_cliche_topic' in df.columns:
262
+ df.drop(columns=['ps2_is_cliche_topic'], inplace=True)
263
+
264
+ # FIX #16b (V9+PS_V5): ps5_* and ps_* V5 features -> NaN for has_ps=0
265
+ ps5_cols = [c for c in df.columns if c.startswith('ps5_') or c in [
266
+ 'ps_word_count_v5', 'ps_flesch_reading_ease', 'ps_flesch_kincaid_grade',
267
+ 'ps_gunning_fog', 'ps_coleman_liau', 'ps_lexical_diversity',
268
+ 'ps_sentence_count', 'ps_avg_sentence_length', 'ps_sentence_length_std',
269
+ 'ps_max_sentence_length', 'ps_min_sentence_length',
270
+ 'ps_sentiment_compound', 'ps_sentiment_positive', 'ps_sentiment_negative', 'ps_sentiment_neutral',
271
+ 'ps_paragraph_count', 'ps_i_count', 'ps_i_ratio', 'ps_we_count', 'ps_my_count',
272
+ 'ps_question_count', 'ps_exclamation_count', 'ps_has_dialogue', 'ps_quote_count',
273
+ 'ps_avg_word_length', 'ps_long_word_ratio', 'ps_transition_count', 'ps_power_word_count']]
274
+ for col in ps5_cols:
275
+ if col in df.columns:
276
+ df.loc[no_ps_mask, col] = np.nan
277
+ print(f" FIX #16b (PS_V5): {len(ps5_cols)} ps5/ps_v5 features -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
278
+
279
+ # Ablation: Remove ps_bert if flag set
280
+ if ABLATE_PS_BERT and ps_bert_cols:
281
+ df.drop(columns=ps_bert_cols, inplace=True)
282
+ print(f" ABLATION: Removed {len(ps_bert_cols)} ps_bert_pca columns")
283
+
284
+ # FIX #8: ps_word_count -> NaN for has_ps=0
285
+ if 'ps_word_count' in df.columns:
286
+ df.loc[no_ps_mask, 'ps_word_count'] = np.nan
287
+
288
+ # FIX #9: ps_flag_* -> NaN for has_ps=0
289
+ ps_flag_cols = [c for c in df.columns if c.startswith('ps_flag_')]
290
+ for col in ps_flag_cols:
291
+ df.loc[no_ps_mask, col] = np.nan
292
+
293
+ # FIX #10: honors_* -> NaN for honors_count=0
294
+ no_honors_mask = df['honors_count'] == 0
295
+ honors_numeric_cols = [c for c in ['honors_max_score', 'honors_avg_score', 'honors_min_score',
296
+ 'honors_total_score', 'honors_quality_ratio',
297
+ 'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
298
+ 'honors_has_national'] if c in df.columns]
299
+ for col in honors_numeric_cols:
300
+ df.loc[no_honors_mask, col] = np.nan
301
+ df['has_honors'] = (df['honors_count'] > 0).astype(int)
302
+
303
+ # FIX #11: act_bert_pca_* -> NaN for act_total_count=0
304
+ no_act_mask = df['act_total_count'] == 0
305
+ act_bert_cols = [c for c in df.columns if c.startswith('act_bert_pca_')]
306
+
307
+ # V9 MODE B: Remove act_bert_pca entirely
308
+ if EXPERIMENT_MODE == 'B' and act_bert_cols:
309
+ df.drop(columns=act_bert_cols, inplace=True)
310
+ print(f" V9 MODE B: REMOVED {len(act_bert_cols)} act_bert_pca columns (replaced by LLM labels)")
311
+ act_bert_cols = []
312
+ else:
313
+ for col in act_bert_cols:
314
+ df.loc[no_act_mask, col] = np.nan
315
+ print(f" FIX #11: act_bert_pca -> NaN for act_total_count=0: {no_act_mask.sum()} rows")
316
+
317
+ # FIX #12: act_slot_pca_* -> NaN for act_total_count=0
318
+ act_slot_cols = [c for c in df.columns if c.startswith('act_slot_pca_')]
319
+ for col in act_slot_cols:
320
+ df.loc[no_act_mask, col] = np.nan
321
+
322
+ # FIX #13-14: cuilu -> NaN when cuilu_hs_total=0
323
+ no_cuilu_mask = df['cuilu_hs_total'] == 0
324
+ for col in ['cuilu_hs_to_univ', 'cuilu_hs_to_univ_pct', 'cuilu_hs_top10_rate',
325
+ 'cuilu_hs_top20_rate', 'cuilu_hs_top10_count', 'cuilu_hs_top20_count']:
326
+ if col in df.columns:
327
+ df.loc[no_cuilu_mask, col] = np.nan
328
+
329
+ # FIX #15: Remove taste_yearly_admits_log
330
+ if 'taste_yearly_admits_log' in df.columns:
331
+ df.drop(columns=['taste_yearly_admits_log'], inplace=True)
332
+
333
+ df['has_act'] = (df['act_total_count'] > 0).astype(int)
334
+ df['has_cuilu'] = (df['cuilu_hs_total'] > 0).astype(int)
335
+
336
+ # V9 NEW #16: Set activity labels to NaN for act_total_count=0
337
+ if EXPERIMENT_MODE in ['A', 'B']:
338
+ act_label_cols_in_df = [c for c in ACT_LABEL_COLS if c in df.columns]
339
+ for col in act_label_cols_in_df:
340
+ df.loc[no_act_mask, col] = np.nan
341
+ n_label_nan = no_act_mask.sum()
342
+ print(f" V9 NEW #16: Activity labels -> NaN for act_total_count=0: {n_label_nan} rows")
343
+
344
+ # Also create has_act_labels flag
345
+ df['has_act_labels'] = df[act_label_cols_in_df[0]].notna().astype(int)
346
+ n_with = df['has_act_labels'].sum()
347
+ print(f" has_act_labels=1: {n_with}, =0: {len(df)-n_with}")
348
+
349
+ # Create aggregate features from labels
350
+ df['act_label_mean'] = df[act_label_cols_in_df].mean(axis=1)
351
+ df['act_label_max'] = df[act_label_cols_in_df].max(axis=1)
352
+ df['act_label_min'] = df[act_label_cols_in_df].min(axis=1)
353
+ df['act_label_std'] = df[act_label_cols_in_df].std(axis=1)
354
+ df['act_label_range'] = df['act_label_max'] - df['act_label_min']
355
+ print(f" Created aggregate features: act_label_mean/max/min/std/range")
356
+
357
+ print(f"\n All V8 fixes applied. Shape: {df.shape}")
358
+
359
+ # Portfolio size transform
360
+ df['portfolio_size_raw'] = df['portfolio_size'].copy()
361
+ df['portfolio_size'] = np.log1p(df['portfolio_size'].clip(upper=20))
362
+ df['portfolio_size_bin'] = pd.cut(df['portfolio_size_raw'],
363
+ bins=[0, 5, 10, 15, 20, 100],
364
+ labels=[0, 1, 2, 3, 4]).astype(int)
365
+
366
+ # ED2 split
367
def get_detailed_round(row):
    """Resolve a row's application round to one of ED2/ED1/REA/EA/RD.

    Looks up the "<student_id>_<school>" key in the module-level
    ``round_lookup`` built from the raw CSV; on a miss, falls back to the
    row's own 'round_cat' value, normalizing legacy 'ED' to 'ED1'.
    """
    student = str(row.get('student_id', '')).replace('.0', '')
    school = str(row.get('school', ''))
    raw = round_lookup.get(f"{student}_{school}", '')

    # Ordered most-specific-first: "Early Decision II" must be tested
    # before "Early Decision" (substring containment), mirroring the
    # original if/elif chain exactly.
    phrase_to_round = (
        ('Early Decision II', 'ED2'),
        ('Early Decision', 'ED1'),
        ('Restrictive Early Action', 'REA'),
        ('Early Action II', 'EA'),
        ('Early Action', 'EA'),
        ('Regular Decision', 'RD'),
    )
    for phrase, label in phrase_to_round:
        if phrase in raw:
            return label

    fallback = str(row.get('round_cat', 'RD'))
    return 'ED1' if fallback == 'ED' else fallback
380
+
381
+ df['round_cat_v2'] = df.apply(get_detailed_round, axis=1)
382
+ df['is_ed1'] = (df['round_cat_v2'] == 'ED1').astype(int)
383
+ df['is_ed2'] = (df['round_cat_v2'] == 'ED2').astype(int)
384
+ df['is_rea'] = (df['round_cat_v2'] == 'REA').astype(int)
385
+ df['is_early'] = df['round_cat_v2'].isin(['ED1', 'ED2', 'EA', 'REA']).astype(int)
386
+ df['round_cat'] = df['round_cat_v2']
387
+
388
+ # ============================================================
389
+ # 3. PARSE LLM FEATURES (same as V8)
390
+ # ============================================================
391
+ act_scores = {}
392
+ raw = llm_features_loaded.get('act_scores', {})
393
+ if isinstance(raw, list):
394
+ for item in raw:
395
+ if isinstance(item, dict) and item.get('success', False):
396
+ sid_raw = str(item.get('student_id', ''))
397
+ act_scores[sid_raw] = item
398
+ parts = sid_raw.split('_')
399
+ for p in parts:
400
+ clean = p.replace('.0', '')
401
+ if clean.isdigit():
402
+ act_scores[clean] = item
403
+ elif isinstance(raw, dict):
404
+ for sid, scores in raw.items():
405
+ if isinstance(scores, dict):
406
+ act_scores[sid] = scores
407
+
408
+ supp_scores = {}
409
+ raw = llm_features_loaded.get('supp_scores', {})
410
+ if isinstance(raw, list):
411
+ for item in raw:
412
+ if isinstance(item, dict) and item.get('success', False):
413
+ sid = str(item.get('student_id', '')).replace('.0', '')
414
+ school = str(item.get('school', ''))
415
+ key = f"{sid}_{school}"
416
+ oq = item.get('overall_quality', 0)
417
+ if isinstance(oq, (int, float)) and oq <= 1:
418
+ continue
419
+ supp_scores[key] = item
420
+ elif isinstance(raw, dict):
421
+ for key, scores in raw.items():
422
+ if isinstance(scores, dict):
423
+ oq = scores.get('overall_quality', 0)
424
+ if isinstance(oq, (int, float)) and oq <= 1:
425
+ continue
426
+ supp_scores[key] = scores
427
+
428
+ major_diff = llm_features_loaded.get('major_diff', {})
429
+ if isinstance(major_diff, list):
430
+ major_diff = {}
431
+
432
+ ps_yale = {}
433
+ raw = llm_features_loaded.get('ps_yale', {})
434
+ if isinstance(raw, list):
435
+ for item in raw:
436
+ if isinstance(item, dict):
437
+ sid = str(item.get('student_id', '')).replace('.0', '')
438
+ ps_yale[sid] = item
439
+ elif isinstance(raw, dict):
440
+ ps_yale = raw
441
+
442
+ print(f"\nLLM features: Activity={len(act_scores)}, Supp={len(supp_scores)}, MajorDiff={len(major_diff)}, PS={len(ps_yale)}")
443
+
444
+ ACT_DIMS = ['max_power_index', 'avg_power_index', 'n_high_power',
445
+ 'n_founder', 'n_president', 'max_scope',
446
+ 'has_publication', 'has_patent', 'has_summer_program',
447
+ 'summer_program_tier', 'has_olympiad', 'olympiad_level',
448
+ 'activity_coherence', 'spike_strength']
449
+
450
+ SUPP_DIMS = ['overall_quality', 'specificity_score', 'enthusiasm_score',
451
+ 'has_imagination_scene', 'mentions_specific_course',
452
+ 'mentions_specific_professor', 'mentions_specific_program',
453
+ 'mentions_specific_facility', 'coherence_with_major', 'has_red_flag']
454
+
455
+ sample_ps = next(iter(ps_yale.values()), {}) if ps_yale else {}
456
+ PS_DIMS = [k for k in sample_ps.keys() if k not in ['student_id', 'success', 'error', 'note', 'essay_type']
457
+ and not k.startswith('is_')]
458
+ if not PS_DIMS:
459
+ PS_DIMS = ['show_not_tell', 'reflection_depth', 'authentic_voice',
460
+ 'coherence_focus', 'overall_effectiveness']
461
+
462
+ # ============================================================
463
+ # 4. DEFINE FEATURE GROUPS
464
+ # ============================================================
465
+ STUDENT_LEVEL_NUMERIC = [
466
+ 'toefl', 'sat', 'gpa',
467
+ 'act_total_count', 'act_type_diversity',
468
+ *[f'act_slot_pca_{i}' for i in range(20)],
469
+ 'honors_max_score', 'honors_avg_score', 'honors_min_score',
470
+ 'honors_count', 'honors_total_score',
471
+ 'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
472
+ 'honors_has_national', 'honors_quality_ratio',
473
+ 'cuilu_hs_top10_rate', 'cuilu_hs_top20_rate',
474
+ 'cuilu_hs_top10_count', 'cuilu_hs_top20_count',
475
+ 'cuilu_hs_total',
476
+ 'cuilu_feeder_rank', 'cuilu_hs_type_rate', 'cuilu_region_rate',
477
+ 'hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_to_univ_hist_admits',
478
+ 'hs_overall_hist_rate',
479
+ 'summer_max_geili', 'summer_has_elite', 'summer_count',
480
+ 'summer_program_count', 'summer_difficulty_max',
481
+ 'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice', 'ps2_overall', 'ps2_mean',
482
+ 'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
483
+ 'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
484
+ 'has_honors', 'has_act', 'has_cuilu',
485
+ ]
486
+
487
+ # Conditionally include act_bert_pca (Mode A keeps, Mode B removes)
488
+ if EXPERIMENT_MODE != 'B':
489
+ STUDENT_LEVEL_NUMERIC.extend([f'act_bert_pca_{i}' for i in range(16)])
490
+
491
+ # Conditionally include ps_bert_pca
492
+ if not ABLATE_PS_BERT:
493
+ STUDENT_LEVEL_NUMERIC.extend([f'ps_bert_pca_{i}' for i in range(16)])
494
+
495
+ # V9: Include activity LLM labels
496
+ if EXPERIMENT_MODE in ['A', 'B']:
497
+ STUDENT_LEVEL_NUMERIC.extend(ACT_LABEL_COLS)
498
+ STUDENT_LEVEL_NUMERIC.extend(['act_label_mean', 'act_label_max', 'act_label_min',
499
+ 'act_label_std', 'act_label_range', 'has_act_labels'])
500
+
501
+ # V9+PS_V5: Include PS V5 hybrid features (LLM extraction + programmatic)
502
+ ps5_feature_cols = [c for c in df.columns if c.startswith('ps5_') or c in [
503
+ 'ps_word_count_v5', 'ps_flesch_reading_ease', 'ps_flesch_kincaid_grade',
504
+ 'ps_gunning_fog', 'ps_coleman_liau', 'ps_lexical_diversity',
505
+ 'ps_sentence_count', 'ps_avg_sentence_length', 'ps_sentence_length_std',
506
+ 'ps_max_sentence_length', 'ps_min_sentence_length',
507
+ 'ps_sentiment_compound', 'ps_sentiment_positive', 'ps_sentiment_negative', 'ps_sentiment_neutral',
508
+ 'ps_paragraph_count', 'ps_i_count', 'ps_i_ratio', 'ps_we_count', 'ps_my_count',
509
+ 'ps_question_count', 'ps_exclamation_count', 'ps_has_dialogue', 'ps_quote_count',
510
+ 'ps_avg_word_length', 'ps_long_word_ratio', 'ps_transition_count', 'ps_power_word_count']]
511
+ STUDENT_LEVEL_NUMERIC.extend(ps5_feature_cols)
512
+ print(f" PS V5 hybrid features added: {len(ps5_feature_cols)} columns")
513
+
514
+ # PS-related features for special school_mean handling
515
+ PS_RELATED_FEATURES = set([
516
+ *[f'ps_bert_pca_{i}' for i in range(16)],
517
+ 'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice',
518
+ 'ps2_overall', 'ps2_mean',
519
+ 'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
520
+ 'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
521
+ 'ps_word_count',
522
+ ])
523
+ # V9+PS_V5: Add PS V5 features to PS_RELATED_FEATURES for proper residualization
524
+ PS_RELATED_FEATURES.update(set(ps5_feature_cols))
525
+
526
+ # V10: Include expanded Supp features (pre-computed in CSV)
527
+ supp_in_data = [c for c in SUPP_ALL_COLS if c in df.columns]
528
+ STUDENT_LEVEL_NUMERIC.extend(supp_in_data)
529
+ print(f" Supp V2 expanded features added: {len(supp_in_data)} columns")
530
+ # Row-level supp features are school-level (student x school)
531
+ # Student-level supp aggregates capture overall supp writing abilityn
532
+
533
+ # V9: Activity label features need special handling (only has_act_labels=1 for school_mean)
534
+ ACT_LABEL_FEATURES = set(ACT_LABEL_COLS + ['act_label_mean', 'act_label_max', 'act_label_min',
535
+ 'act_label_std', 'act_label_range'])
536
+
537
+ # Add act_type_count columns
538
+ act_type_cols_in_data = [c for c in df.columns if c.startswith('act_type_count_')]
539
+ STUDENT_LEVEL_NUMERIC.extend(act_type_cols_in_data)
540
+
541
+ # Filter to existing
542
+ STUDENT_LEVEL_NUMERIC = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
543
+ print(f"\n Student-level numeric features: {len(STUDENT_LEVEL_NUMERIC)}")
544
+
545
+ KEY_STUDENT_FEATURES = [
546
+ 'toefl', 'sat', 'gpa',
547
+ 'honors_max_score', 'honors_avg_score', 'honors_count',
548
+ 'honors_quality_ratio',
549
+ 'act_type_diversity', 'act_total_count',
550
+ 'hs_to_univ_hist_rate_smoothed',
551
+ 'summer_max_geili',
552
+ 'ps2_overall', 'ps2_character_revelation', 'ps2_craft_voice',
553
+ ]
554
+
555
+ # V9: Add top activity labels to key features for interactions
556
+ if EXPERIMENT_MODE in ['A', 'B']:
557
+ KEY_STUDENT_FEATURES.extend(['act_label_mean', 'social_impact_depth',
558
+ 'tone_calibration', 'academic_depth'])
559
+
560
+ LLM_INTERACTION_FEATURES = [
561
+ 'llm_act_mean', 'llm_act_max', 'llm_act_avg_power_index',
562
+ 'supp_mean', 'supp_max', 'supp_composite', 'supp_student_avg_composite',
563
+ 'ps_mean', 'major_difficulty',
564
+ 'ps2_mean', 'ps2_overall',
565
+ ]
566
+
567
+ # ============================================================
568
+ # 5. BUILD FEATURES
569
+ # ============================================================
570
def build_features_base(df):
    """Attach LLM-derived scores, aggregates, interactions and encoded categoricals.

    Reads the module-level lookups ``act_scores`` / ``major_diff`` / ``ps_yale``
    and the ``ACT_DIMS`` / ``PS_DIMS`` / ``SUPP_ROW_COLS`` column lists, then:
      * maps per-student LLM activity and PS-essay scores onto rows,
      * builds row-wise mean/max aggregates for activity, supp and PS scores,
      * creates pairwise interaction features (scaled score products),
      * label-encodes the categorical columns plus school-cross columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned feature matrix; must contain student_id, year, school,
        major_cat and the score columns referenced below.

    Returns
    -------
    tuple[pandas.DataFrame, list[str]]
        The augmented copy of *df* and the names of the (now integer
        label-encoded) categorical columns.
    """
    df = df.copy()  # never mutate the caller's frame

    df['is_partial_year'] = (df['year'] == 2025).astype(int)
    df['year_cat'] = df['year'].astype(str)
    # NOTE(review): replaces '.0' anywhere in the id string, not just a
    # trailing '.0' -- this matches how the lookup keys were built above,
    # but would mangle an id like '10.05'; confirm ids are integral.
    df['sid_str'] = df['student_id'].astype(str).str.replace('.0', '', regex=False)

    # LLM Activity features: one column per ACT_DIMS dimension; NaN when
    # the student has no entry in act_scores. `d=dim` binds the loop
    # variable at definition time (avoids the late-binding closure bug).
    for dim in ACT_DIMS:
        col_name = f'llm_act_{dim}'
        df[col_name] = df['sid_str'].map(
            lambda s, d=dim: safe_num(act_scores.get(s, {}).get(d, np.nan)))

    # LLM Supp features - V10: already pre-computed in CSV, just compute aggregates
    # supp_composite and row-level scores are already in the dataframe

    # Major difficulty: keyed on "<school>_<major_cat>" in the major_diff dict.
    def get_major_diff(row):
        key = f"{row['school']}_{row['major_cat']}"
        return safe_num(major_diff.get(key, {}).get('difficulty_score', np.nan))
    df['major_difficulty'] = df.apply(get_major_diff, axis=1)

    # PS (personal statement) Yale-rubric scores, one column per PS_DIMS entry.
    for dim in PS_DIMS:
        col_name = f'ps_{dim}'
        df[col_name] = df['sid_str'].map(
            lambda s, d=dim: safe_num(ps_yale.get(s, {}).get(d, np.nan)))

    # Aggregates (row-wise; pandas mean/max/std skip NaN by default)
    llm_act_cols = [f'llm_act_{d}' for d in ACT_DIMS]
    valid_act = df[llm_act_cols]
    df['llm_act_mean'] = valid_act.mean(axis=1)
    df['llm_act_max'] = valid_act.max(axis=1)
    df['llm_act_n_valid'] = valid_act.notna().sum(axis=1)

    # V10: Use pre-computed supp_composite as supp_mean, and compute supp_max from row-level scores
    supp_row_in_df = [c for c in SUPP_ROW_COLS if c in df.columns and c != 'supp_composite']
    if supp_row_in_df:
        valid_supp = df[supp_row_in_df]
        df['supp_mean'] = valid_supp.mean(axis=1)
        df['supp_max'] = valid_supp.max(axis=1)
    elif 'supp_composite' in df.columns:
        # Only the composite survived into the CSV: reuse it for both stats.
        df['supp_mean'] = df['supp_composite']
        df['supp_max'] = df['supp_composite']
    else:
        df['supp_mean'] = np.nan
        df['supp_max'] = np.nan

    ps_cols = [f'ps_{d}' for d in PS_DIMS]
    valid_ps = df[ps_cols]
    df['ps_mean'] = valid_ps.mean(axis=1)

    # Basic interactions: products of headline scores, scaled by their
    # nominal maxima (SAT 1600, TOEFL 120) or a round constant.
    df['toefl_x_sat'] = df['toefl'] * df['sat'] / 10000.0
    df['gpa_x_toefl'] = df['gpa'] * df['toefl'] / 100.0
    df['llm_act_x_supp'] = df['llm_act_mean'] * df['supp_mean']

    if 'honors_avg_score' in df.columns:
        df['honors_x_sat'] = df['honors_avg_score'] * df['sat'] / 1600
        df['honors_x_toefl'] = df['honors_avg_score'] * df['toefl'] / 120

    if 'cuilu_hs_top10_rate' in df.columns and 'taste_score_sensitivity' in df.columns:
        df['cuilu_x_taste'] = df['cuilu_hs_top10_rate'] * df['taste_score_sensitivity']

    # V9: Activity label interactions (only when LLM labels were merged, modes A/B)
    if EXPERIMENT_MODE in ['A', 'B'] and 'act_label_mean' in df.columns:
        df['act_label_x_supp'] = df['act_label_mean'] * df['supp_mean']
        df['act_label_x_llm_act'] = df['act_label_mean'] * df['llm_act_mean']
        if 'ps2_mean' in df.columns:
            df['act_label_x_ps2'] = df['act_label_mean'] * df['ps2_mean']
        print(f" V9: Created activity label interaction features")

    # Categoricals: base columns plus school x round / major / hs-type crosses
    cat_cols = ['school', 'round_cat', 'major_cat', 'hs_cat', 'year_cat', 'hs_name', 'province']
    cat_cols = [c for c in cat_cols if c in df.columns]

    if 'round_cat' in df.columns:
        df['school_round'] = df['school'].astype(str) + '_' + df['round_cat'].astype(str)
        cat_cols.append('school_round')
    df['school_major'] = df['school'].astype(str) + '_' + df['major_cat'].astype(str)
    cat_cols.append('school_major')
    if 'hs_cat' in df.columns:
        df['school_hstype'] = df['school'].astype(str) + '_' + df['hs_cat'].astype(str)
        cat_cols.append('school_hstype')

    # Integer label-encode every categorical in place; '_MISSING_' stands in
    # for NaN so the encoder sees one consistent missing token. Note the
    # encoder is fit per call, so codes are stable only within one frame.
    for c in cat_cols:
        df[c] = df[c].fillna('_MISSING_').astype(str)
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c]).astype(int)

    return df, cat_cols
661
+
662
+
663
+ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
664
+ df = df.copy()
665
+
666
+ train_df = df[train_mask]
667
+ global_rate = train_df[TARGET].mean()
668
+
669
+ school_stats = train_df.groupby('school').agg(
670
+ school_raw_rate=(TARGET, 'mean'),
671
+ school_n_apps=(TARGET, 'count'),
672
+ school_n_admits=(TARGET, 'sum'),
673
+ ).reset_index()
674
+
675
+ SMOOTH_STRENGTH = 30
676
+ school_stats['school_base_rate'] = (
677
+ (school_stats['school_raw_rate'] * school_stats['school_n_apps'] + global_rate * SMOOTH_STRENGTH) /
678
+ (school_stats['school_n_apps'] + SMOOTH_STRENGTH)
679
+ )
680
+
681
+ df = df.merge(school_stats[['school', 'school_base_rate', 'school_n_apps', 'school_n_admits']],
682
+ on='school', how='left')
683
+ df['school_base_rate'] = df['school_base_rate'].fillna(global_rate)
684
+ df['school_n_apps'] = df['school_n_apps'].fillna(0)
685
+ df['school_n_admits'] = df['school_n_admits'].fillna(0)
686
+
687
+ # ED boost
688
+ ed1_mask = train_df['is_ed1'] == 1
689
+ rd_mask = train_df['is_early'] == 0
690
+ ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
691
+ rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()
692
+
693
+ ed_boost_map = {}
694
+ for school in ed1_school_rates.index:
695
+ if school in rd_school_rates.index:
696
+ ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
697
+ df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)
698
+
699
+ ed2_mask = train_df['is_ed2'] == 1
700
+ ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
701
+ ed2_boost_map = {}
702
+ for school in ed2_school_rates.index:
703
+ if school in rd_school_rates.index:
704
+ ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
705
+ df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)
706
+
707
+ # Residualize student features
708
+ student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
709
+ train_has_ps = train_df[train_df['has_ps'] == 1]
710
+
711
+ # V9: Pre-compute has_act_labels=1 subset for activity label features
712
+ if 'has_act_labels' in train_df.columns:
713
+ train_has_act_labels = train_df[train_df['has_act_labels'] == 1]
714
+ else:
715
+ train_has_act_labels = train_df
716
+
717
+ resid_cols = []
718
+ for col in student_feat_available:
719
+ resid_col = f'{col}_resid'
720
+
721
+ if col in PS_RELATED_FEATURES:
722
+ school_mean_series = train_has_ps.groupby('school')[col].mean()
723
+ elif col in ACT_LABEL_FEATURES:
724
+ # V9: Use has_act_labels=1 subset for activity label features
725
+ school_mean_series = train_has_act_labels.groupby('school')[col].mean()
726
+ elif col.startswith('honors_') and col != 'honors_count':
727
+ train_has_honors = train_df[train_df['honors_count'] > 0]
728
+ school_mean_series = train_has_honors.groupby('school')[col].mean()
729
+ elif col.startswith('act_bert_pca_') or col.startswith('act_slot_pca_'):
730
+ train_has_act = train_df[train_df['act_total_count'] > 0]
731
+ school_mean_series = train_has_act.groupby('school')[col].mean()
732
+ elif col.startswith('cuilu_hs_to_univ') or col in ['cuilu_hs_top10_rate', 'cuilu_hs_top20_rate', 'cuilu_hs_top10_count', 'cuilu_hs_top20_count']:
733
+ train_has_cuilu = train_df[train_df['cuilu_hs_total'] > 0]
734
+ school_mean_series = train_has_cuilu.groupby('school')[col].mean()
735
+ else:
736
+ school_mean_series = train_df.groupby('school')[col].mean()
737
+
738
+ col_school_mean = df['school'].map(school_mean_series)
739
+ df[resid_col] = df[col] - col_school_mean
740
+ resid_cols.append(resid_col)
741
+
742
+ # V9 NEW #17: Activity label PCA (reduce multicollinearity)
743
+ act_label_pca_cols = []
744
+ if EXPERIMENT_MODE in ['A', 'B']:
745
+ act_label_cols_in_df = [c for c in ACT_LABEL_COLS if c in df.columns]
746
+ if act_label_cols_in_df:
747
+ # Fit PCA on training data only
748
+ train_label_data = train_df[act_label_cols_in_df].dropna()
749
+ if len(train_label_data) > N_ACT_LABEL_PCA * 2:
750
+ scaler = StandardScaler()
751
+ pca = PCA(n_components=N_ACT_LABEL_PCA, random_state=42)
752
+
753
+ train_scaled = scaler.fit_transform(train_label_data)
754
+ pca.fit(train_scaled)
755
+
756
+ # Transform all data
757
+ all_label_data = df[act_label_cols_in_df].copy()
758
+ # Fill NaN with column mean for PCA transform, then set back to NaN
759
+ has_any_label = all_label_data.notna().any(axis=1)
760
+ fill_means = train_label_data.mean()
761
+ all_label_filled = all_label_data.fillna(fill_means)
762
+ all_scaled = scaler.transform(all_label_filled)
763
+ pca_result = pca.transform(all_scaled)
764
+
765
+ for i in range(N_ACT_LABEL_PCA):
766
+ col_name = f'act_label_pca_{i}'
767
+ df[col_name] = pca_result[:, i]
768
+ # Set to NaN where original labels were all NaN
769
+ df.loc[~has_any_label, col_name] = np.nan
770
+ act_label_pca_cols.append(col_name)
771
+
772
+ var_explained = pca.explained_variance_ratio_
773
+ print(f" V9 NEW #17: Activity label PCA: {len(act_label_cols_in_df)} -> {N_ACT_LABEL_PCA} components")
774
+ print(f" Variance explained: {var_explained.sum():.3f} ({', '.join(f'{v:.3f}' for v in var_explained)})")
775
+
776
+ # ps2_mean_school_pctile
777
+ pctile_ps_cols = []
778
+ if 'ps2_mean' in df.columns:
779
+ ps_pctile_col = 'ps2_mean_school_pctile'
780
+ school_ps_distributions = {}
781
+ for school_id in train_has_ps['school'].unique():
782
+ vals = train_has_ps[train_has_ps['school'] == school_id]['ps2_mean'].dropna().values
783
+ if len(vals) > 2:
784
+ school_ps_distributions[school_id] = vals
785
+
786
+ def compute_ps_pctile(row, sd=school_ps_distributions):
787
+ school_id = row['school']
788
+ val = row['ps2_mean']
789
+ if pd.isna(val) or school_id not in sd:
790
+ return np.nan
791
+ return np.mean(sd[school_id] <= val)
792
+
793
+ df[ps_pctile_col] = df.apply(compute_ps_pctile, axis=1)
794
+ pctile_ps_cols.append(ps_pctile_col)
795
+
796
+ # V9 NEW #18: Activity label school percentile
797
+ act_label_pctile_cols = []
798
+ if EXPERIMENT_MODE in ['A', 'B'] and 'act_label_mean' in df.columns:
799
+ al_pctile_col = 'act_label_mean_school_pctile'
800
+ school_al_distributions = {}
801
+ for school_id in train_has_act_labels['school'].unique():
802
+ vals = train_has_act_labels[train_has_act_labels['school'] == school_id]['act_label_mean'].dropna().values
803
+ if len(vals) > 2:
804
+ school_al_distributions[school_id] = vals
805
+
806
+ def compute_al_pctile(row, sd=school_al_distributions):
807
+ school_id = row['school']
808
+ val = row['act_label_mean']
809
+ if pd.isna(val) or school_id not in sd:
810
+ return np.nan
811
+ return np.mean(sd[school_id] <= val)
812
+
813
+ df[al_pctile_col] = df.apply(compute_al_pctile, axis=1)
814
+ act_label_pctile_cols.append(al_pctile_col)
815
+ n_valid = df[al_pctile_col].notna().sum()
816
+ print(f" V9 NEW #18: {al_pctile_col}: {n_valid} valid values")
817
+
818
+ # Interactions
819
+ interaction_cols = []
820
+ for col in KEY_STUDENT_FEATURES:
821
+ if col in df.columns:
822
+ int_col = f'{col}_x_school_rate'
823
+ df[int_col] = df[col] * df['school_base_rate']
824
+ interaction_cols.append(int_col)
825
+
826
+ resid_col = f'{col}_resid'
827
+ if resid_col in df.columns:
828
+ int_resid_col = f'{col}_resid_x_rate'
829
+ df[int_resid_col] = df[resid_col] * df['school_base_rate']
830
+ interaction_cols.append(int_resid_col)
831
+
832
+ for col in LLM_INTERACTION_FEATURES:
833
+ if col in df.columns:
834
+ int_col = f'{col}_x_school_rate'
835
+ df[int_col] = df[col] * df['school_base_rate']
836
+ interaction_cols.append(int_col)
837
+
838
+ if 'portfolio_size' in df.columns:
839
+ df['portfolio_x_school_rate'] = df['portfolio_size'] * df['school_base_rate']
840
+ interaction_cols.append('portfolio_x_school_rate')
841
+
842
+ if 'is_ed1' in df.columns:
843
+ df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
844
+ interaction_cols.append('ed1_x_ed_boost')
845
+ if 'is_ed2' in df.columns:
846
+ df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
847
+ interaction_cols.append('ed2_x_ed2_boost')
848
+
849
+ for flag in ['has_sat', 'has_toefl', 'has_gpa']:
850
+ if flag in df.columns:
851
+ int_col = f'{flag}_x_school_rate'
852
+ df[int_col] = df[flag] * df['school_base_rate']
853
+ interaction_cols.append(int_col)
854
+
855
+ if 'ps2_mean_school_pctile' in df.columns:
856
+ df['ps2_pctile_x_school_rate'] = df['ps2_mean_school_pctile'] * df['school_base_rate']
857
+ interaction_cols.append('ps2_pctile_x_school_rate')
858
+
859
+ # V9 NEW #19: Activity label percentile x school_base_rate
860
+ if 'act_label_mean_school_pctile' in df.columns:
861
+ df['act_label_pctile_x_school_rate'] = df['act_label_mean_school_pctile'] * df['school_base_rate']
862
+ interaction_cols.append('act_label_pctile_x_school_rate')
863
+
864
+ # Student percentile within school
865
+ pctile_cols = []
866
+ for col in ['toefl', 'sat', 'gpa', 'honors_max_score', 'llm_act_mean', 'supp_mean']:
867
+ if col not in df.columns:
868
+ continue
869
+ pctile_col = f'{col}_school_pctile'
870
+ school_distributions = {}
871
+ for school_id in train_df['school'].unique():
872
+ vals = train_df[train_df['school'] == school_id][col].dropna().values
873
+ if len(vals) > 2:
874
+ school_distributions[school_id] = vals
875
+
876
+ def compute_pctile(row, col=col, sd=school_distributions):
877
+ school_id = row['school']
878
+ val = row[col]
879
+ if pd.isna(val) or school_id not in sd:
880
+ return np.nan
881
+ return np.mean(sd[school_id] <= val)
882
+
883
+ df[pctile_col] = df.apply(compute_pctile, axis=1)
884
+ pctile_cols.append(pctile_col)
885
+
886
+ pctile_cols.extend(pctile_ps_cols)
887
+ pctile_cols.extend(act_label_pctile_cols)
888
+
889
+ # Student competitiveness
890
+ if all(c in df.columns for c in ['toefl', 'sat', 'honors_max_score']):
891
+ components = []
892
+ weights = []
893
+ for col, w, scale in [('toefl', 0.3, 120), ('sat', 0.3, 1600),
894
+ ('honors_max_score', 0.2, 10), ('llm_act_mean', 0.2, 10)]:
895
+ if col in df.columns:
896
+ components.append(df[col] / scale)
897
+ weights.append(w)
898
+ if components:
899
+ strength_df = pd.DataFrame(components).T
900
+ df['student_strength'] = strength_df.mean(axis=1)
901
+ df['strength_vs_school'] = df['student_strength'] - (1 - df['school_base_rate'])
902
+
903
+ # Build final feature list
904
+ num_cols = [c for c in df.columns if df[c].dtype in ['float64', 'int64', 'float32', 'int32']
905
+ and c not in [TARGET, 'student_id', 'year', 'Unnamed: 0']]
906
+
907
+ all_feat = list(set(num_cols + cat_cols))
908
+ feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
909
+ for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size_raw']:
910
+ if remove in feature_cols:
911
+ feature_cols.remove(remove)
912
+
913
+ to_drop = [c for c in feature_cols if df[c].nunique() <= 1]
914
+ feature_cols = [c for c in feature_cols if c not in to_drop]
915
+
916
+ if selected_features is not None:
917
+ must_keep = set(cat_cols) | {'school_base_rate', 'school_n_apps', 'school_n_admits',
918
+ 'student_strength', 'strength_vs_school',
919
+ 'school_ed_boost', 'school_ed2_boost',
920
+ 'is_ed1', 'is_ed2', 'is_rea', 'is_early',
921
+ 'ed1_x_ed_boost', 'ed2_x_ed2_boost',
922
+ 'has_sat', 'has_toefl', 'has_gpa',
923
+ 'portfolio_size', 'portfolio_size_bin', 'portfolio_x_school_rate',
924
+ 'ps2_mean_school_pctile', 'ps2_pctile_x_school_rate',
925
+ 'has_honors', 'has_act', 'has_cuilu'}
926
+ # V9: Always keep activity label features
927
+ if EXPERIMENT_MODE in ['A', 'B']:
928
+ must_keep.update(set(act_label_pca_cols))
929
+ must_keep.update({'act_label_mean', 'act_label_max', 'act_label_std',
930
+ 'act_label_mean_school_pctile', 'act_label_pctile_x_school_rate',
931
+ 'has_act_labels', 'act_label_x_supp', 'act_label_x_llm_act'})
932
+ feature_cols = [c for c in feature_cols if c in selected_features or c in must_keep]
933
+
934
+ for c in feature_cols:
935
+ if df[c].dtype in ['float64', 'float32']:
936
+ df[c] = df[c].replace([np.inf, -np.inf], np.nan)
937
+
938
+ cat_indices = [feature_cols.index(c) for c in cat_cols if c in feature_cols]
939
+
940
+ new_feat_count = len(resid_cols) + len(interaction_cols) + len(pctile_cols) + len(act_label_pca_cols) + 5
941
+ print(f" Features: {len(resid_cols)} resid + {len(interaction_cols)} interact + {len(pctile_cols)} pctile + {len(act_label_pca_cols)} label_pca = total {len(feature_cols)}")
942
+
943
+ return df, feature_cols, cat_cols, cat_indices
944
+
945
+
946
# ============================================================
# 6. BUILD BASE FEATURES
# ============================================================
# Build the fold-independent feature frame once. Fold-dependent features
# (school target stats, residuals, percentiles) are added later per split
# by add_residualized_features.
df_base, cat_cols = build_features_base(df)
print(f"\nBase features built. Shape: {df_base.shape}")

# Target vector and grouping key: GroupKFold groups by student_id so the
# same applicant never appears in both train and validation folds.
y = df_base[TARGET].values
groups = df_base['student_id'].values
+
955
# ============================================================
# 7. STAGE 1: FEATURE IMPORTANCE ESTIMATION
# ============================================================
# Quick 5-fold CatBoost pass over ALL candidate features to rank them;
# the top FEATURE_SELECT_TOP_N (plus all categoricals) become the
# `selected_set` used by every later stage.
print(f"\n{'='*70}")
print(f" STAGE 1: FEATURE IMPORTANCE ESTIMATION")
print(f"{'='*70}")

stage1_fi = []
gkf_s1 = GroupKFold(n_splits=5)
for fold, (tr_idx, va_idx) in enumerate(gkf_s1.split(df_base, y, groups)):
    # Boolean mask aligned with df_base's index; fold-dependent features are
    # fit only on this training subset (no target leakage into validation).
    train_mask = pd.Series(False, index=df_base.index)
    train_mask.iloc[tr_idx] = True

    df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
        df_base, train_mask, cat_cols)

    # NOTE(review): X_tr/X_va are slices of df_fold; the .astype assignments
    # below may trigger pandas SettingWithCopyWarning — consider .copy().
    X_tr = df_fold[feat_cols_f].iloc[tr_idx]
    X_va = df_fold[feat_cols_f].iloc[va_idx]
    y_tr = y[tr_idx]
    y_va = y[va_idx]

    # CatBoost requires integer (or string) dtype for categorical columns.
    for c in cat_cols_f:
        if c in X_tr.columns:
            X_tr[c] = X_tr[c].astype(int)
            X_va[c] = X_va[c].astype(int)

    # Lighter settings than stage 2 (500 iters, lr 0.05): this pass only
    # needs a stable importance ranking, not maximal AUC.
    cb = CatBoostClassifier(
        iterations=500, depth=6, learning_rate=0.05,
        l2_leaf_reg=7, random_seed=42, verbose=0,
        cat_features=cat_idx_f, eval_metric='AUC',
        early_stopping_rounds=50)
    pool_tr = Pool(X_tr, y_tr, cat_features=cat_idx_f)
    pool_va = Pool(X_va, y_va, cat_features=cat_idx_f)
    cb.fit(pool_tr, eval_set=pool_va, verbose=0)

    fi = cb.get_feature_importance()
    stage1_fi.append(fi)

    auc = roc_auc_score(y_va, cb.predict_proba(Pool(X_va, cat_features=cat_idx_f))[:, 1])
    print(f" Fold {fold+1}/5: AUC={auc:.4f}, Features={len(feat_cols_f)}")

    # Feature names are assumed identical across folds; capture fold 0's.
    # NOTE(review): if a fold-dependent column is constant in some fold the
    # lists could differ — relies on add_residualized_features being stable.
    if fold == 0:
        all_feature_names = feat_cols_f

    # Free model/pool memory before the next fold.
    del cb, pool_tr, pool_va, df_fold; gc.collect()

# Average importance across folds, then rank descending.
avg_fi = np.mean(stage1_fi, axis=0)
fi_pairs = sorted(zip(all_feature_names, avg_fi), key=lambda x: -x[1])

# Keep all categoricals unconditionally, plus the top-N numeric features.
selected_set = set(cat_cols)
n_added = 0
for fname, imp in fi_pairs:
    if fname not in cat_cols:
        selected_set.add(fname)
        n_added += 1
        if n_added >= FEATURE_SELECT_TOP_N:
            break

print(f"\n Feature selection: {len(all_feature_names)} -> {len(selected_set)} features")
print(f" Top 30 features:")
for i, (fname, imp) in enumerate(fi_pairs[:30]):
    # Tag each feature with its family for readability of the report.
    marker = ""
    if 'act_label' in fname or fname in ACT_LABEL_COLS: marker = " [ACT_LABEL_V9]"
    elif '_resid' in fname: marker = " [R]"
    elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [I]"
    elif '_school_pctile' in fname: marker = " [P]"
    elif 'school_base_rate' in fname: marker = " [S]"
    elif 'ed_boost' in fname: marker = " [ED]"
    elif 'ps2_' in fname: marker = " [PS2]"
    # NOTE(review): `or`/`and` precedence here means the 'bert' exclusion
    # only applies to the startswith('ps_') branch — likely intended.
    elif 'ps5_' in fname or fname.startswith('ps_') and 'bert' not in fname: marker = " [PS_V5]"
    elif 'supp_' in fname: marker = " [SUPP_V2]"
    elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
    print(f" {i+1:3d}. {fname:<55s} {imp:>8.2f}{marker}")

# Count V9 new features in top 50
v9_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_label' in f or f in ACT_LABEL_COLS)
ps5_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'ps5_' in f or (f.startswith('ps_') and 'bert' not in f and f in ps5_feature_cols))
supp_v2_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'supp_' in f)
print(f" PS V5 features in top 50: {ps5_in_top50}")
print(f" Supp V2 features in top 50: {supp_v2_in_top50}")
bert_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_bert_pca' in f)
print(f"\n V9 activity label features in top 50: {v9_in_top50}")
print(f" act_bert_pca features in top 50: {bert_in_top50}")
+
1039
# ============================================================
# 8. TEMPORAL VALIDATION
# ============================================================
# Hold-out by time: fit on 2020-2023 applications, evaluate on 2024, to
# estimate how the model generalizes to the next admissions cycle.
print(f"\n{'='*70}")
print(f" TEMPORAL VALIDATION (2020-2023 -> 2024)")
print(f"{'='*70}")

mask_train_temporal = df_base['year'].isin([2020, 2021, 2022, 2023])
mask_test_temporal = df_base['year'] == 2024

temporal_results = {}
if mask_test_temporal.sum() > 0:
    # Fold-dependent features fit only on the pre-2024 rows.
    df_temporal, feat_cols_t, cat_cols_t, cat_idx_t = add_residualized_features(
        df_base, mask_train_temporal, cat_cols, selected_features=selected_set)

    X_t = df_temporal[feat_cols_t].copy()
    for c in cat_cols_t:
        if c in X_t.columns:
            X_t[c] = X_t[c].astype(int)

    X_tr_t = X_t[mask_train_temporal]
    X_te_t = X_t[mask_test_temporal]
    y_tr_t = y[mask_train_temporal]
    y_te_t = y[mask_test_temporal]

    # LightGBM/XGBoost receive NaN encoded as -999 (CatBoost handles NaN
    # natively so gets the unfilled frames).
    X_tr_t_filled = X_tr_t.fillna(-999)
    X_te_t_filled = X_te_t.fillna(-999)

    print(f" Train: {len(X_tr_t)}, Test: {len(X_te_t)}, Features: {len(feat_cols_t)}")

    for seed in SEEDS:
        # NOTE(review): the 2024 set is used as the early-stopping eval set
        # for all three models, which slightly optimistically biases the
        # temporal AUC — acceptable for a relative comparison, but worth
        # confirming this is intentional.
        cb_t = CatBoostClassifier(
            iterations=1000, depth=6, learning_rate=0.03,
            l2_leaf_reg=7, random_seed=seed, verbose=0,
            cat_features=cat_idx_t, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=10)
        pool_tr = Pool(X_tr_t, y_tr_t, cat_features=cat_idx_t)
        pool_te = Pool(X_te_t, y_te_t, cat_features=cat_idx_t)
        cb_t.fit(pool_tr, eval_set=pool_te, verbose=0)
        cb_pred = cb_t.predict_proba(Pool(X_te_t, cat_features=cat_idx_t))[:, 1]
        del cb_t; gc.collect()

        lgb_tr = lgb.Dataset(X_tr_t_filled.values, y_tr_t, categorical_feature=cat_idx_t)
        lgb_va = lgb.Dataset(X_te_t_filled.values, y_te_t, categorical_feature=cat_idx_t, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
            'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
                              valid_sets=[lgb_va],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_te_t_filled.values)
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_t_filled.values, label=y_tr_t, enable_categorical=False)
        dtest = xgb.DMatrix(X_te_t_filled.values, label=y_te_t, enable_categorical=False)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 6, 'learning_rate': 0.03,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'min_child_weight': 5,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                              evals=[(dtest, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dtest)
        del xgb_model, dtrain, dtest; gc.collect()

        # Fixed blend weights here; stage 10 searches weights on OOF preds.
        blend = 0.45 * cb_pred + 0.20 * lgb_pred + 0.35 * xgb_pred
        temporal_results[seed] = {
            'cb': float(roc_auc_score(y_te_t, cb_pred)),
            'lgb': float(roc_auc_score(y_te_t, lgb_pred)),
            'xgb': float(roc_auc_score(y_te_t, xgb_pred)),
            'blend': float(roc_auc_score(y_te_t, blend))
        }
        print(f" Seed {seed}: CB={temporal_results[seed]['cb']:.4f} LGB={temporal_results[seed]['lgb']:.4f} XGB={temporal_results[seed]['xgb']:.4f} Blend={temporal_results[seed]['blend']:.4f}")

    # Average blended AUC over seeds; deltas vs previous script versions.
    avg_temporal = np.mean([v['blend'] for v in temporal_results.values()])
    print(f"\n AVG Temporal Blend: {avg_temporal:.4f}")
    print(f" Delta vs V37.3: {avg_temporal - 0.8410:+.4f}")
    print(f" Delta vs V38.2-PRO-V4: {avg_temporal - 0.8555:+.4f}")
    print(f" Delta vs V38.2-PRO-V8: {avg_temporal - 0.8548:+.4f}")
    print(f" Delta vs V38.2-PRO-V9: {avg_temporal - 0.8594:+.4f}")

    del df_temporal, X_t; gc.collect()
else:
    # No 2024 rows available: record 0.0 so the results JSON stays valid.
    avg_temporal = 0.0
+
1132
# ============================================================
# 9. STAGE 2: MULTI-SEED GROUPKFOLD
# ============================================================
# Main evaluation: for each seed, produce full out-of-fold (OOF)
# predictions from CatBoost, LightGBM and XGBoost over N_FOLDS
# student-grouped folds, using only the stage-1 selected features.
print(f"\n{'='*70}")
print(f" STAGE 2: MULTI-SEED GROUPKFOLD ({len(SEEDS)} seeds x {N_FOLDS} folds)")
print(f"{'='*70}")

all_cb_oof = []
all_lgb_oof = []
all_xgb_oof = []
all_fi = []
feature_cols_final = None

for seed_idx, seed in enumerate(SEEDS):
    print(f"\n --- Seed {seed} ({seed_idx+1}/{len(SEEDS)}) ---")
    # NOTE(review): GroupKFold is deterministic, so the fold split is
    # identical across seeds — only model randomness varies per seed.
    gkf = GroupKFold(n_splits=N_FOLDS)
    cb_oof = np.zeros(len(df_base))
    lgb_oof = np.zeros(len(df_base))
    xgb_oof = np.zeros(len(df_base))

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(df_base, y, groups)):
        # Recompute fold-dependent features on this fold's training rows.
        train_mask = pd.Series(False, index=df_base.index)
        train_mask.iloc[tr_idx] = True

        df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
            df_base, train_mask, cat_cols, selected_features=selected_set)

        # Capture the first fold's feature list for reporting/saving.
        if feature_cols_final is None:
            feature_cols_final = feat_cols_f
            print(f" Total features after selection: {len(feat_cols_f)}")

        X_fold = df_fold[feat_cols_f].copy()
        for c in cat_cols_f:
            if c in X_fold.columns:
                X_fold[c] = X_fold[c].astype(int)

        X_tr_df = X_fold.iloc[tr_idx]
        X_va_df = X_fold.iloc[va_idx]
        y_tr = y[tr_idx]
        y_va = y[va_idx]

        # --- CatBoost (handles NaN and categoricals natively) ---
        cb = CatBoostClassifier(
            iterations=1500, depth=6, learning_rate=0.03,
            l2_leaf_reg=7, random_seed=seed, verbose=0,
            cat_features=cat_idx_f, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=10)
        pool_tr = Pool(X_tr_df, y_tr, cat_features=cat_idx_f)
        pool_va = Pool(X_va_df, y_va, cat_features=cat_idx_f)
        cb.fit(pool_tr, eval_set=pool_va, verbose=0)
        cb_pred = cb.predict_proba(Pool(X_va_df, cat_features=cat_idx_f))[:, 1]
        cb_oof[va_idx] = cb_pred

        # Keep one importance vector per seed (from the last fold only).
        if fold == N_FOLDS - 1:
            all_fi.append(cb.get_feature_importance())
        del cb, pool_tr, pool_va; gc.collect()

        # LightGBM/XGBoost get NaN encoded as -999 sentinel.
        X_tr_filled = X_tr_df.fillna(-999).values
        X_va_filled = X_va_df.fillna(-999).values

        # --- LightGBM ---
        lgb_tr = lgb.Dataset(X_tr_filled, y_tr, categorical_feature=cat_idx_f)
        lgb_va_ds = lgb.Dataset(X_va_filled, y_va, categorical_feature=cat_idx_f, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.03, 'num_leaves': 63, 'max_depth': 6,
            'min_child_samples': 25, 'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'feature_fraction': 0.7, 'bagging_fraction': 0.8, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=1500,
                              valid_sets=[lgb_va_ds],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_va_filled)
        lgb_oof[va_idx] = lgb_pred
        del lgb_model; gc.collect()

        # --- XGBoost ---
        dtrain = xgb.DMatrix(X_tr_filled, label=y_tr)
        dval = xgb.DMatrix(X_va_filled, label=y_va)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 6, 'learning_rate': 0.03,
            'subsample': 0.8, 'colsample_bytree': 0.7,
            'reg_alpha': 0.3, 'reg_lambda': 2.0,
            'min_child_weight': 5,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1500,
                              evals=[(dval, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dval)
        xgb_oof[va_idx] = xgb_pred
        del xgb_model, dtrain, dval, df_fold, X_fold; gc.collect()

        # Progress heartbeat every 5 folds.
        if (fold + 1) % 5 == 0:
            print(f" Fold {fold+1}/{N_FOLDS} done")

    # Per-seed OOF AUC for each base model.
    cb_auc = roc_auc_score(y, cb_oof)
    lgb_auc = roc_auc_score(y, lgb_oof)
    xgb_auc = roc_auc_score(y, xgb_oof)
    print(f" CB: {cb_auc:.4f} LGB: {lgb_auc:.4f} XGB: {xgb_auc:.4f}")

    all_cb_oof.append(cb_oof)
    all_lgb_oof.append(lgb_oof)
    all_xgb_oof.append(xgb_oof)
+
1236
# ============================================================
# 10. ENSEMBLE & BLEND
# ============================================================
# Average OOF predictions across seeds per model, then grid-search convex
# blend weights (CB, LGB, XGB summing to 1) that maximize OOF AUC.
print(f"\n{'='*70}")
print(f" ENSEMBLE RESULTS (MODE={EXPERIMENT_MODE})")
print(f"{'='*70}")

cb_avg = np.mean(all_cb_oof, axis=0)
lgb_avg = np.mean(all_lgb_oof, axis=0)
xgb_avg = np.mean(all_xgb_oof, axis=0)

cb_final_auc = roc_auc_score(y, cb_avg)
lgb_final_auc = roc_auc_score(y, lgb_avg)
xgb_final_auc = roc_auc_score(y, xgb_avg)

print(f" CB {len(SEEDS)}-seed avg: {cb_final_auc:.4f}")
print(f" LGB {len(SEEDS)}-seed avg: {lgb_final_auc:.4f}")
print(f" XGB {len(SEEDS)}-seed avg: {xgb_final_auc:.4f}")

# Exhaustive 0.05-step grid over (w_cb, w_lgb); w_xgb is the remainder and
# must be at least 0.05 so every model contributes.
# NOTE(review): weights are tuned on the same OOF predictions they are
# evaluated on — the reported blend AUC is mildly optimistic.
best_auc = 0
best_weights = (0.45, 0.20, 0.35)
for w_cb in np.arange(0.2, 0.7, 0.05):
    for w_lgb in np.arange(0.05, 0.5, 0.05):
        w_xgb = 1.0 - w_cb - w_lgb
        if w_xgb < 0.05: continue
        blend = w_cb * cb_avg + w_lgb * lgb_avg + w_xgb * xgb_avg
        auc = roc_auc_score(y, blend)
        if auc > best_auc:
            best_auc = auc
            best_weights = (w_cb, w_lgb, w_xgb)

print(f"\n Best 3-model blend: {best_auc:.4f}")
print(f" Delta vs V37.3: {best_auc - 0.8697:+.4f}")
print(f" Delta vs V38.2-PRO-V4: {best_auc - 0.8758:+.4f}")
print(f" Delta vs V38.2-PRO-V8: {best_auc - 0.8753:+.4f}")
print(f" Delta vs V38.2-PRO-V9: {best_auc - 0.8772:+.4f}")
print(f" Weights: CB={best_weights[0]:.2f} LGB={best_weights[1]:.2f} XGB={best_weights[2]:.2f}")

# Rank-average blend as a scale-free alternative to probability averaging.
rank_blend = (rankdata(cb_avg) + rankdata(lgb_avg) + rankdata(xgb_avg)) / 3
rank_auc = roc_auc_score(y, rank_blend)
print(f" Rank blend: {rank_auc:.4f}")

# Final blended probabilities; clip before Brier/log-loss to avoid log(0).
final_blend_prob = best_weights[0] * cb_avg + best_weights[1] * lgb_avg + best_weights[2] * xgb_avg
final_auc = roc_auc_score(y, final_blend_prob)
final_brier = brier_score_loss(y, np.clip(final_blend_prob, 1e-7, 1-1e-7))
final_logloss = log_loss(y, np.clip(final_blend_prob, 1e-7, 1-1e-7))

print(f"\n FINAL METRICS:")
print(f" AUC: {final_auc:.4f}")
print(f" Brier: {final_brier:.4f}")
print(f" LogLoss: {final_logloss:.4f}")
+
1288
# ============================================================
# 11. FEATURE IMPORTANCE
# ============================================================
# Report CatBoost importances averaged over the per-seed last-fold vectors
# collected in stage 2 (one per seed), top 50 features with family tags.
print(f"\n{'='*70}")
print(f" FEATURE IMPORTANCE (MODE={EXPERIMENT_MODE})")
print(f"{'='*70}")

if feature_cols_final and all_fi:
    # Rebinds the stage-1 names avg_fi/fi_pairs to stage-2 values; stage-1
    # data is no longer needed at this point.
    avg_fi = np.mean(all_fi, axis=0)
    fi_pairs = sorted(zip(feature_cols_final, avg_fi), key=lambda x: -x[1])

    print(f" {'Rank':<5s} {'Feature':<55s} {'Importance':>10s}")
    print(f" {'-'*5} {'-'*55} {'-'*10}")
    for i, (fname, imp) in enumerate(fi_pairs[:50]):
        # Family tag for readability (checked in priority order).
        marker = ""
        if 'act_label' in fname or fname in ACT_LABEL_COLS: marker = " [ACT_LABEL_V9]"
        elif '_resid' in fname: marker = " [RESID]"
        elif '_x_school_rate' in fname or '_resid_x_rate' in fname: marker = " [INTERACT]"
        elif '_school_pctile' in fname: marker = " [PCTILE]"
        elif fname.startswith('school_base_rate'): marker = " [SCHOOL_RATE]"
        elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
        elif 'ps2_' in fname: marker = " [PS2]"
        print(f" {i+1:<5d} {fname:<55s} {imp:>10.2f}{marker}")

    # Summary counts for the V9 feature families within the top 30.
    v9_in_top30 = sum(1 for f, _ in fi_pairs[:30] if 'act_label' in f or f in ACT_LABEL_COLS)
    bert_in_top30 = sum(1 for f, _ in fi_pairs[:30] if 'act_bert_pca' in f)
    print(f"\n V9 activity label features in top 30: {v9_in_top30}")
    print(f" act_bert_pca features in top 30: {bert_in_top30}")
+
1317
# ============================================================
# 12. SAVE RESULTS
# ============================================================
# Persist the run summary (JSON) and per-application OOF predictions (CSV)
# to OUTPUT_DIR, suffixed by the experiment mode.
elapsed = time.time() - start_time

results = {
    'version': f'V38.2-pro-v10-mode-{EXPERIMENT_MODE}',
    'experiment_mode': EXPERIMENT_MODE,
    'mode_description': mode_desc.get(EXPERIMENT_MODE, 'UNKNOWN'),
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'elapsed_minutes': elapsed / 60,
    'changes': [
        'All V9 fixes carried forward',
        f'EXPERIMENT MODE: {EXPERIMENT_MODE} - {mode_desc.get(EXPERIMENT_MODE)}',
        'NEW #20: Expanded Supp features (42 cols: row/avg/max/binary)',
        'NEW #21: All features pre-computed in V10 CSV',
    ],
    # Hard-coded baselines from prior script versions, for delta tracking.
    'comparison': {
        'v37_3': {'auc': 0.8697, 'temporal_auc': 0.8410},
        'v38_2_pro_v4': {'auc': 0.8758, 'temporal_auc': 0.8555},
        'v38_2_pro_v8': {'auc': 0.8753, 'temporal_auc': 0.8548},
        'v38_2_pro_v9': {'auc': 0.8772, 'temporal_auc': 0.8594},
    },
    # NOTE: json.dump converts the integer seed keys of per_seed to strings.
    'temporal_validation': {
        'per_seed': temporal_results,
        'avg_blend': float(avg_temporal),
    },
    'groupkfold': {
        'best_3model_blend': float(best_auc),
        'best_weights': [float(w) for w in best_weights],
        'rank_blend': float(rank_auc),
    },
    'final_metrics': {
        'auc': float(final_auc),
        'brier': float(final_brier),
        'logloss': float(final_logloss),
    },
    'n_features': len(feature_cols_final) if feature_cols_final else 0,
    # fi_pairs here is the stage-2 ranking from section 11 (stage-1 ranking
    # if section 11's guard was false — both bind the same name).
    'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
}

suffix = f'_mode_{EXPERIMENT_MODE}'
with open(os.path.join(OUTPUT_DIR, f'v38_2_pro_v10{suffix}_results.json'), 'w') as f:
    json.dump(results, f, indent=2)

# OOF predictions per application row: seed-averaged per-model plus blend.
oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
oof_df['cb_pred'] = cb_avg
oof_df['lgb_pred'] = lgb_avg
oof_df['xgb_pred'] = xgb_avg
oof_df['final_pred'] = final_blend_prob
oof_df.to_csv(os.path.join(OUTPUT_DIR, f'v38_2_pro_v10{suffix}_oof_predictions.csv'), index=False)

print(f"\n{'='*70}")
print(f" V38.2-PRO-V10 MODE={EXPERIMENT_MODE} COMPLETE")
print(f" Total time: {elapsed/60:.1f} minutes")
print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'}")
print(f" GroupKFold AUC: {final_auc:.4f}")
print(f" Temporal AUC: {avg_temporal:.4f}")
print(f"{'='*70}")