catninja123 commited on
Commit
e798b65
·
verified ·
1 Parent(s): 113ae8d

Upload train_v38_2_pro_v12.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_v38_2_pro_v12.py +1436 -0
train_v38_2_pro_v12.py ADDED
@@ -0,0 +1,1436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ====================================================================
3
+ V38.2-PRO-V12 MODEL - Recovered Features + Per-Program Summer
4
+ ====================================================================
5
+ Carries forward all V11 features, PLUS:
6
+ NEW #26: ACT_DIMS (14) from llm_activity_scores.json now also in CSV
7
+ NEW #27: Per-program summer features (11 new: count_geili_ge5/6/7/8, sum, std, etc.)
8
+ NEW #28: summer_award_count recovered from V4
9
+ NEW #29: Aggressive pruning maintained at 100 features
10
+
11
+ ABLATION EXPERIMENT (controlled by EXPERIMENT_MODE):
12
+ "A" = Full model with all features (pruned + tuned)
13
+ "B" = Full model - act_bert_pca (replace with labels)
14
+ "C" = Baseline (no new labels, control)
15
+ ====================================================================
16
+ """
17
+ import pandas as pd
18
+ import numpy as np
19
+ import json, os, warnings, sys, time, pickle, gc
20
+ warnings.filterwarnings('ignore')
21
+ from sklearn.model_selection import GroupKFold
22
+ from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss
23
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
24
+ from sklearn.decomposition import PCA
25
+ from scipy.stats import rankdata
26
+
27
+ try:
28
+ from catboost import CatBoostClassifier, Pool
29
+ import lightgbm as lgb
30
+ import xgboost as xgb
31
+ print("All model libraries loaded successfully")
32
+ except ImportError as e:
33
+ print(f"Missing library: {e}")
34
+ import subprocess
35
+ subprocess.check_call([sys.executable, '-m', 'pip', 'install',
36
+ 'catboost', 'lightgbm', 'xgboost', '-q'])
37
+ from catboost import CatBoostClassifier, Pool
38
+ import lightgbm as lgb
39
+ import xgboost as xgb
40
+
41
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
42
+ DATA_DIR = os.path.join(BASE_DIR, 'data')
43
+ OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
44
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
45
+
46
+ TARGET = 'target'
47
+ SEEDS = [42, 123, 456, 789, 2024]
48
+ N_FOLDS = 10
49
+ FEATURE_SELECT_TOP_N = 100 # V12: Maintain aggressive pruning at 100
50
+ start_time = time.time()
51
+
52
+ # ============================================================
53
+ # EXPERIMENT MODE - controls ablation variant
54
+ # ============================================================
55
+ # "A" = V8 + new labels (keep act_bert_pca)
56
+ # "B" = V8 + new labels - act_bert_pca (replace BERT with labels)
57
+ # "C" = V8 baseline (no new labels, keep BERT) = control
58
+ EXPERIMENT_MODE = os.environ.get('V9_MODE', 'A')
59
+ ABLATE_PS_BERT = False # Keep PS BERT for now (separate concern)
60
+
61
+ # Activity LLM label columns
62
+ ACT_LABEL_COLS = [
63
+ 'activity_uniqueness', 'impact_quantifiability', 'academic_depth',
64
+ 'social_impact_depth', 'institutional_prestige', 'activity_diversity',
65
+ 'entrepreneurial_initiative', 'cross_activity_synergy',
66
+ 'intellectual_generosity', 'writing_craft', 'personal_voice',
67
+ 'info_architecture', 'tone_calibration'
68
+ ]
69
+ N_ACT_LABEL_PCA = 5 # Reduce 13 labels to 5 PCA components
70
+
71
+ # Supp V2 expanded columns (pre-computed in V10 CSV)
72
+ SUPP_ROW_COLS = [
73
+ 'supp_school_specific_program_references', 'supp_school_specific_faculty_mentions',
74
+ 'supp_school_specific_campus_features', 'supp_prompt_specific_alignment',
75
+ 'supp_personal_connection_to_school', 'supp_intellectual_engagement_depth',
76
+ 'supp_extracurricular_alignment', 'supp_values_alignment_with_school',
77
+ 'supp_specific_future_contribution', 'supp_unique_personal_context', 'supp_composite',
78
+ ]
79
+ SUPP_STUDENT_AVG_COLS = [
80
+ 'supp_avg_school_specific_program_references', 'supp_avg_school_specific_faculty_mentions',
81
+ 'supp_avg_school_specific_campus_features', 'supp_avg_prompt_specific_alignment',
82
+ 'supp_avg_personal_connection_to_school', 'supp_avg_intellectual_engagement_depth',
83
+ 'supp_avg_extracurricular_alignment', 'supp_avg_values_alignment_with_school',
84
+ 'supp_avg_specific_future_contribution', 'supp_avg_unique_personal_context',
85
+ ]
86
+ SUPP_STUDENT_MAX_COLS = [
87
+ 'supp_max_school_specific_program_references', 'supp_max_school_specific_faculty_mentions',
88
+ 'supp_max_school_specific_campus_features', 'supp_max_prompt_specific_alignment',
89
+ 'supp_max_personal_connection_to_school', 'supp_max_intellectual_engagement_depth',
90
+ 'supp_max_extracurricular_alignment', 'supp_max_values_alignment_with_school',
91
+ 'supp_max_specific_future_contribution', 'supp_max_unique_personal_context',
92
+ ]
93
+ SUPP_STUDENT_AGG_COLS = ['supp_student_avg_composite', 'supp_student_max_composite', 'supp_student_std_composite', 'supp_n_scored']
94
+ SUPP_BINARY_COLS = ['supp_has_campus_feature', 'supp_has_faculty_mention', 'supp_has_future_contribution',
95
+ 'supp_has_personal_connection', 'supp_has_program_ref', 'supp_has_strong_supp', 'supp_high_specificity']
96
+ SUPP_ALL_COLS = SUPP_ROW_COLS + SUPP_STUDENT_AVG_COLS + SUPP_STUDENT_MAX_COLS + SUPP_STUDENT_AGG_COLS + SUPP_BINARY_COLS
97
+
98
def safe_num(v, default=np.nan):
    """Coerce *v* to float, mapping the -1 sentinel and parse failures to NaN.

    Parameters
    ----------
    v : int | float | str | Any
        Raw value pulled from an LLM-score JSON payload.
    default : float
        Value returned when *v* is not numeric-looking (defaults to NaN).

    Returns
    -------
    float
        ``float(v)``; NaN when the value equals -1 (the upstream
        missing-value sentinel); *default* for unparseable input.
    """
    if isinstance(v, (int, float)):
        val = float(v)
        return np.nan if val == -1 else val
    if isinstance(v, str):
        try:
            val = float(v)
        except ValueError:
            # Non-numeric string (e.g. "N/A") — fall back to default.
            return default
        return np.nan if val == -1 else val
    return default
109
+
110
+ # ============================================================
111
+ # 1. LOAD DATA
112
+ # ============================================================
113
+ print("=" * 70)
114
+ print(f" V38.2-PRO-V12: RECOVERED FEATURES + PER-PROGRAM SUMMER")
115
+ print(f" EXPERIMENT MODE = {EXPERIMENT_MODE}")
116
+ print("=" * 70)
117
+ mode_desc = {
118
+ 'A': 'V8 + 13 new labels (keep BERT)',
119
+ 'B': 'V8 + 13 new labels - act_bert_pca (replace)',
120
+ 'C': 'V8 baseline (no new labels, control)',
121
+ }
122
+ print(f" Mode description: {mode_desc.get(EXPERIMENT_MODE, 'UNKNOWN')}")
123
+
124
+ # Load main feature matrix (V12 includes V10 + ACT_DIMS + per-program summer + act labels)
125
+ v12_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v12.csv')
126
+ v10_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v10.csv')
127
+ v9_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v9.csv')
128
+ v8_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v8.csv')
129
+ if os.path.exists(v12_path):
130
+ df_raw = pd.read_csv(v12_path)
131
+ print(f"V12 features loaded (V10 + ACT_DIMS + summer): {df_raw.shape}")
132
+ elif os.path.exists(v10_path):
133
+ df_raw = pd.read_csv(v10_path)
134
+ print(f"V10 features loaded: {df_raw.shape}")
135
+ elif os.path.exists(v9_path):
136
+ df_raw = pd.read_csv(v9_path)
137
+ print(f"V9 features loaded: {df_raw.shape}")
138
+ elif os.path.exists(v8_path):
139
+ df_raw = pd.read_csv(v8_path)
140
+ print(f"V8 features loaded: {df_raw.shape}")
141
+ else:
142
+ raise FileNotFoundError("No feature matrix found!")
143
+
144
+ # Load activity LLM labels
145
+ act_labels_path = os.path.join(DATA_DIR, 'act_labels_v2_results.csv')
146
+ act_labels_df = None
147
+ if EXPERIMENT_MODE in ['A', 'B'] and os.path.exists(act_labels_path):
148
+ act_labels_df = pd.read_csv(act_labels_path)
149
+ print(f"Activity LLM labels loaded: {act_labels_df.shape}")
150
+ print(f" Labels: {[c for c in act_labels_df.columns if c != 'student_id']}")
151
+ elif EXPERIMENT_MODE in ['A', 'B']:
152
+ print(f"WARNING: act_labels_v2_results.csv not found! Falling back to mode C")
153
+ EXPERIMENT_MODE = 'C'
154
+
155
+ # Load LLM features
156
+ llm_features_loaded = {}
157
+ for fname, varname in [
158
+ ('llm_activity_scores.json', 'act_scores'),
159
+ ('llm_supp_quality_all.json', 'supp_scores'),
160
+ ('llm_major_difficulty.json', 'major_diff'),
161
+ ('ps_yale_scores.json', 'ps_yale'),
162
+ ]:
163
+ fpath = os.path.join(DATA_DIR, fname)
164
+ if os.path.exists(fpath):
165
+ with open(fpath) as f:
166
+ llm_features_loaded[varname] = json.load(f)
167
+ print(f" Loaded {fname}: {len(llm_features_loaded[varname])} entries")
168
+ else:
169
+ llm_features_loaded[varname] = {}
170
+
171
+ # Load raw data for ED2 round info
172
+ import re
173
+ RAW_CSV = os.path.join(DATA_DIR, 'students_with_essays_merged_clean.csv')
174
+ round_lookup = {}
175
+ if os.path.exists(RAW_CSV):
176
+ print(f"\n Loading raw CSV for ED2 round info...")
177
+ try:
178
+ raw_chunks = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'],
179
+ dtype=str, chunksize=500)
180
+ for chunk in raw_chunks:
181
+ for _, row in chunk.iterrows():
182
+ sid = str(row.get('student_id', '')).replace('.0', '')
183
+ summary = str(row.get('school_results_summary', ''))
184
+ entries = re.split(r'(?=\d+\.)', summary)
185
+ for entry in entries:
186
+ m = re.search(r'(Early Decision II|Early Decision|Early Action II|Early Action|Restrictive Early Action|Regular Decision)', entry)
187
+ if m:
188
+ round_type = m.group(1)
189
+ school_m = re.search(r'\d+\.\s*(.+?)(?:\s*[-–]\s*|\s*\()', entry)
190
+ if school_m:
191
+ school_name = school_m.group(1).strip()
192
+ key = f"{sid}_{school_name}"
193
+ round_lookup[key] = round_type
194
+ print(f" Round lookup built: {len(round_lookup)} entries")
195
+ except Exception as e:
196
+ print(f" Warning: Could not load raw CSV: {e}")
197
+
198
+ # ============================================================
199
+ # 2. MERGE ACTIVITY LLM LABELS INTO MAIN DATAFRAME
200
+ # ============================================================
201
+ if act_labels_df is not None and EXPERIMENT_MODE in ['A', 'B']:
202
+ # Merge on student_id
203
+ n_before = len(df_raw)
204
+ df_raw = df_raw.merge(act_labels_df, on='student_id', how='left')
205
+ assert len(df_raw) == n_before, f"Merge changed row count! {n_before} -> {len(df_raw)}"
206
+
207
+ n_with_labels = df_raw[ACT_LABEL_COLS[0]].notna().sum()
208
+ n_without = df_raw[ACT_LABEL_COLS[0]].isna().sum()
209
+ print(f"\n Activity labels merged: {n_with_labels} rows with labels, {n_without} without ({n_without/len(df_raw)*100:.1f}% NaN)")
210
+
211
+ # ============================================================
212
+ # 3. DATA CLEANING & QUALITY FIXES (same as V8)
213
+ # ============================================================
214
+ print(f"\n{'='*70}")
215
+ print(f" DATA QUALITY FIXES (V8 inherited)")
216
+ print(f"{'='*70}")
217
+
218
+ # Filter years
219
+ df = df_raw[~df_raw['year'].isin([2018, 2019])].copy().reset_index(drop=True)
220
+ print(f"After filtering 2018-2019: {df.shape}")
221
+
222
+ # FIX #1: SAT=0 -> NaN
223
+ sat_zero = (df['sat'] == 0).sum()
224
+ df['has_sat'] = (df['sat'] > 0).astype(int)
225
+ df.loc[df['sat'] == 0, 'sat'] = np.nan
226
+ print(f" FIX #1: SAT=0 -> NaN: {sat_zero} rows")
227
+
228
+ # FIX #2: TOEFL=0 -> NaN
229
+ toefl_zero = (df['toefl'] == 0).sum()
230
+ df['has_toefl'] = (df['toefl'] > 0).astype(int)
231
+ df.loc[df['toefl'] == 0, 'toefl'] = np.nan
232
+ print(f" FIX #2: TOEFL=0 -> NaN: {toefl_zero} rows")
233
+
234
+ # FIX #3: GPA=0 -> NaN
235
+ gpa_zero = (df['gpa'] == 0).sum()
236
+ df.loc[df['gpa'] == 0, 'gpa'] = np.nan
237
+ if 'has_gpa' not in df.columns:
238
+ df['has_gpa'] = df['gpa'].notna().astype(int)
239
+ print(f" FIX #3: GPA=0 -> NaN: {gpa_zero} rows")
240
+
241
+ # FIX #4: -1 -> NaN
242
+ sentinel_cols = ['taste_yearly_admits_log']
243
+ for col in ['hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_overall_hist_rate']:
244
+ if col in df.columns:
245
+ sentinel_cols.append(col)
246
+ for col in sentinel_cols:
247
+ if col in df.columns:
248
+ n_neg1 = (df[col] == -1).sum()
249
+ df.loc[df[col] == -1, col] = np.nan
250
+ if n_neg1 > 0:
251
+ print(f" FIX #4: {col}: -1 -> NaN: {n_neg1} rows")
252
+
253
+ # FIX #5: ps_bert -> NaN for has_ps=0
254
+ ps_bert_cols = [c for c in df.columns if c.startswith('ps_bert_pca_')]
255
+ no_ps_mask = df['has_ps'] == 0
256
+ for col in ps_bert_cols:
257
+ df.loc[no_ps_mask, col] = np.nan
258
+ print(f" FIX #5: ps_bert -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
259
+
260
+ # FIX #6: ps2_* -> NaN for has_ps=0
261
+ ps2_score_cols = [c for c in df.columns if c.startswith('ps2_') and c != 'ps2_essay_type']
262
+ for col in ps2_score_cols:
263
+ df.loc[no_ps_mask, col] = np.nan
264
+ print(f" FIX #6: ps2_* -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
265
+
266
+ # FIX #6b: Remove ps2_is_cliche_topic
267
+ if 'ps2_is_cliche_topic' in df.columns:
268
+ df.drop(columns=['ps2_is_cliche_topic'], inplace=True)
269
+
270
+ # FIX #16b (V9+PS_V5): ps5_* and ps_* V5 features -> NaN for has_ps=0
271
+ ps5_cols = [c for c in df.columns if c.startswith('ps5_') or c in [
272
+ 'ps_word_count_v5', 'ps_flesch_reading_ease', 'ps_flesch_kincaid_grade',
273
+ 'ps_gunning_fog', 'ps_coleman_liau', 'ps_lexical_diversity',
274
+ 'ps_sentence_count', 'ps_avg_sentence_length', 'ps_sentence_length_std',
275
+ 'ps_max_sentence_length', 'ps_min_sentence_length',
276
+ 'ps_sentiment_compound', 'ps_sentiment_positive', 'ps_sentiment_negative', 'ps_sentiment_neutral',
277
+ 'ps_paragraph_count', 'ps_i_count', 'ps_i_ratio', 'ps_we_count', 'ps_my_count',
278
+ 'ps_question_count', 'ps_exclamation_count', 'ps_has_dialogue', 'ps_quote_count',
279
+ 'ps_avg_word_length', 'ps_long_word_ratio', 'ps_transition_count', 'ps_power_word_count']]
280
+ for col in ps5_cols:
281
+ if col in df.columns:
282
+ df.loc[no_ps_mask, col] = np.nan
283
+ print(f" FIX #16b (PS_V5): {len(ps5_cols)} ps5/ps_v5 features -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
284
+
285
+ # Ablation: Remove ps_bert if flag set
286
+ if ABLATE_PS_BERT and ps_bert_cols:
287
+ df.drop(columns=ps_bert_cols, inplace=True)
288
+ print(f" ABLATION: Removed {len(ps_bert_cols)} ps_bert_pca columns")
289
+
290
+ # FIX #8: ps_word_count -> NaN for has_ps=0
291
+ if 'ps_word_count' in df.columns:
292
+ df.loc[no_ps_mask, 'ps_word_count'] = np.nan
293
+
294
+ # FIX #9: ps_flag_* -> NaN for has_ps=0
295
+ ps_flag_cols = [c for c in df.columns if c.startswith('ps_flag_')]
296
+ for col in ps_flag_cols:
297
+ df.loc[no_ps_mask, col] = np.nan
298
+
299
+ # FIX #10: honors_* -> NaN for honors_count=0
300
+ no_honors_mask = df['honors_count'] == 0
301
+ honors_numeric_cols = [c for c in ['honors_max_score', 'honors_avg_score', 'honors_min_score',
302
+ 'honors_total_score', 'honors_quality_ratio',
303
+ 'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
304
+ 'honors_has_national'] if c in df.columns]
305
+ for col in honors_numeric_cols:
306
+ df.loc[no_honors_mask, col] = np.nan
307
+ df['has_honors'] = (df['honors_count'] > 0).astype(int)
308
+
309
+ # FIX #11: act_bert_pca_* -> NaN for act_total_count=0
310
+ no_act_mask = df['act_total_count'] == 0
311
+ act_bert_cols = [c for c in df.columns if c.startswith('act_bert_pca_')]
312
+
313
+ # V9 MODE B: Remove act_bert_pca entirely
314
+ if EXPERIMENT_MODE == 'B' and act_bert_cols:
315
+ df.drop(columns=act_bert_cols, inplace=True)
316
+ print(f" V9 MODE B: REMOVED {len(act_bert_cols)} act_bert_pca columns (replaced by LLM labels)")
317
+ act_bert_cols = []
318
+ else:
319
+ for col in act_bert_cols:
320
+ df.loc[no_act_mask, col] = np.nan
321
+ print(f" FIX #11: act_bert_pca -> NaN for act_total_count=0: {no_act_mask.sum()} rows")
322
+
323
+ # FIX #12: act_slot_pca_* -> NaN for act_total_count=0
324
+ act_slot_cols = [c for c in df.columns if c.startswith('act_slot_pca_')]
325
+ for col in act_slot_cols:
326
+ df.loc[no_act_mask, col] = np.nan
327
+
328
+ # FIX #13-14: cuilu -> NaN when cuilu_hs_total=0
329
+ no_cuilu_mask = df['cuilu_hs_total'] == 0
330
+ for col in ['cuilu_hs_to_univ', 'cuilu_hs_to_univ_pct', 'cuilu_hs_top10_rate',
331
+ 'cuilu_hs_top20_rate', 'cuilu_hs_top10_count', 'cuilu_hs_top20_count']:
332
+ if col in df.columns:
333
+ df.loc[no_cuilu_mask, col] = np.nan
334
+
335
+ # FIX #15: Remove taste_yearly_admits_log
336
+ if 'taste_yearly_admits_log' in df.columns:
337
+ df.drop(columns=['taste_yearly_admits_log'], inplace=True)
338
+
339
+ df['has_act'] = (df['act_total_count'] > 0).astype(int)
340
+ df['has_cuilu'] = (df['cuilu_hs_total'] > 0).astype(int)
341
+
342
+ # V9 NEW #16: Set activity labels to NaN for act_total_count=0
343
+ if EXPERIMENT_MODE in ['A', 'B']:
344
+ act_label_cols_in_df = [c for c in ACT_LABEL_COLS if c in df.columns]
345
+ for col in act_label_cols_in_df:
346
+ df.loc[no_act_mask, col] = np.nan
347
+ n_label_nan = no_act_mask.sum()
348
+ print(f" V9 NEW #16: Activity labels -> NaN for act_total_count=0: {n_label_nan} rows")
349
+
350
+ # Also create has_act_labels flag
351
+ df['has_act_labels'] = df[act_label_cols_in_df[0]].notna().astype(int)
352
+ n_with = df['has_act_labels'].sum()
353
+ print(f" has_act_labels=1: {n_with}, =0: {len(df)-n_with}")
354
+
355
+ # Create aggregate features from labels
356
+ df['act_label_mean'] = df[act_label_cols_in_df].mean(axis=1)
357
+ df['act_label_max'] = df[act_label_cols_in_df].max(axis=1)
358
+ df['act_label_min'] = df[act_label_cols_in_df].min(axis=1)
359
+ df['act_label_std'] = df[act_label_cols_in_df].std(axis=1)
360
+ df['act_label_range'] = df['act_label_max'] - df['act_label_min']
361
+ print(f" Created aggregate features: act_label_mean/max/min/std/range")
362
+
363
+ print(f"\n All V8 fixes applied. Shape: {df.shape}")
364
+
365
# Portfolio size transform.
# Keep the raw application count, then compress it (log1p of the value capped
# at 20) so very long school lists don't dominate tree splits.
df['portfolio_size_raw'] = df['portfolio_size'].copy()
df['portfolio_size'] = np.log1p(df['portfolio_size'].clip(upper=20))
# FIX: pd.cut uses right-closed intervals, so with bins starting at 0 a raw
# value of exactly 0 fell outside the first bin, and values > 100 fell outside
# the last one; either case yielded NaN and made .astype(int) raise on the
# categorical. Clip into range and include the lowest edge so every row gets
# a valid bin label in {0..4}.
df['portfolio_size_bin'] = pd.cut(df['portfolio_size_raw'].clip(lower=0, upper=100),
                                  bins=[0, 5, 10, 15, 20, 100],
                                  labels=[0, 1, 2, 3, 4],
                                  include_lowest=True).astype(int)
371
+
372
+ # ED2 split
373
def get_detailed_round(row, lookup=None):
    """Resolve the detailed application-round label for one (student, school) row.

    Looks up the raw round string scraped from ``school_results_summary`` and
    maps it onto the finer-grained categories {ED1, ED2, REA, EA, RD}; falls
    back to the row's original ``round_cat`` (mapping legacy 'ED' to 'ED1')
    when no scraped entry exists.

    Parameters
    ----------
    row : Mapping
        Row exposing ``.get`` with 'student_id', 'school' and optionally
        'round_cat'.
    lookup : dict | None
        Override for the module-level ``round_lookup``; defaults to the
        module-level table (parameter added for testability).

    Returns
    -------
    str
        'ED1', 'ED2', 'REA', 'EA', 'RD', or the row's original round_cat.
    """
    if lookup is None:
        lookup = round_lookup
    # NOTE(review): .replace('.0', '') strips interior '.0' too (e.g. '10.05'
    # -> '105'); kept as-is because round_lookup keys were built with the
    # exact same transform, so the two must stay in sync.
    sid = str(row.get('student_id', '')).replace('.0', '')
    school = str(row.get('school', ''))
    raw_round = lookup.get(f"{sid}_{school}", '')
    # Order matters: match the more specific labels first ('Early Decision II'
    # before 'Early Decision', 'Restrictive Early Action' before 'Early Action').
    if 'Early Decision II' in raw_round:
        return 'ED2'
    if 'Early Decision' in raw_round:
        return 'ED1'
    if 'Restrictive Early Action' in raw_round:
        return 'REA'
    if 'Early Action II' in raw_round or 'Early Action' in raw_round:
        return 'EA'
    if 'Regular Decision' in raw_round:
        return 'RD'
    orig = str(row.get('round_cat', 'RD'))
    return 'ED1' if orig == 'ED' else orig
386
+
387
+ df['round_cat_v2'] = df.apply(get_detailed_round, axis=1)
388
+ df['is_ed1'] = (df['round_cat_v2'] == 'ED1').astype(int)
389
+ df['is_ed2'] = (df['round_cat_v2'] == 'ED2').astype(int)
390
+ df['is_rea'] = (df['round_cat_v2'] == 'REA').astype(int)
391
+ df['is_early'] = df['round_cat_v2'].isin(['ED1', 'ED2', 'EA', 'REA']).astype(int)
392
+ df['round_cat'] = df['round_cat_v2']
393
+
394
+ # ============================================================
395
+ # 3. PARSE LLM FEATURES (same as V8)
396
+ # ============================================================
397
# --- Activity LLM scores: normalize into a {student_id: record} lookup -----
# The JSON payload comes in two shapes: a list of per-student result dicts
# (each carrying a 'success' flag) or a plain {student_id: scores} mapping.
act_scores = {}
raw = llm_features_loaded.get('act_scores', {})
if isinstance(raw, list):
    ok_records = (r for r in raw if isinstance(r, dict) and r.get('success', False))
    for record in ok_records:
        sid_raw = str(record.get('student_id', ''))
        act_scores[sid_raw] = record
        # Also index the record under every purely-numeric fragment of a
        # composite id (with any '.0' stripped) so either key form resolves.
        for fragment in sid_raw.split('_'):
            cleaned = fragment.replace('.0', '')
            if cleaned.isdigit():
                act_scores[cleaned] = record
elif isinstance(raw, dict):
    for sid_key, score_map in raw.items():
        if isinstance(score_map, dict):
            act_scores[sid_key] = score_map
413
+
414
+ supp_scores = {}
415
+ raw = llm_features_loaded.get('supp_scores', {})
416
+ if isinstance(raw, list):
417
+ for item in raw:
418
+ if isinstance(item, dict) and item.get('success', False):
419
+ sid = str(item.get('student_id', '')).replace('.0', '')
420
+ school = str(item.get('school', ''))
421
+ key = f"{sid}_{school}"
422
+ oq = item.get('overall_quality', 0)
423
+ if isinstance(oq, (int, float)) and oq <= 1:
424
+ continue
425
+ supp_scores[key] = item
426
+ elif isinstance(raw, dict):
427
+ for key, scores in raw.items():
428
+ if isinstance(scores, dict):
429
+ oq = scores.get('overall_quality', 0)
430
+ if isinstance(oq, (int, float)) and oq <= 1:
431
+ continue
432
+ supp_scores[key] = scores
433
+
434
+ major_diff = llm_features_loaded.get('major_diff', {})
435
+ if isinstance(major_diff, list):
436
+ major_diff = {}
437
+
438
+ ps_yale = {}
439
+ raw = llm_features_loaded.get('ps_yale', {})
440
+ if isinstance(raw, list):
441
+ for item in raw:
442
+ if isinstance(item, dict):
443
+ sid = str(item.get('student_id', '')).replace('.0', '')
444
+ ps_yale[sid] = item
445
+ elif isinstance(raw, dict):
446
+ ps_yale = raw
447
+
448
+ print(f"\nLLM features: Activity={len(act_scores)}, Supp={len(supp_scores)}, MajorDiff={len(major_diff)}, PS={len(ps_yale)}")
449
+
450
+ ACT_DIMS = ['max_power_index', 'avg_power_index', 'n_high_power',
451
+ 'n_founder', 'n_president', 'max_scope',
452
+ 'has_publication', 'has_patent', 'has_summer_program',
453
+ 'summer_program_tier', 'has_olympiad', 'olympiad_level',
454
+ 'activity_coherence', 'spike_strength']
455
+
456
+ SUPP_DIMS = ['overall_quality', 'specificity_score', 'enthusiasm_score',
457
+ 'has_imagination_scene', 'mentions_specific_course',
458
+ 'mentions_specific_professor', 'mentions_specific_program',
459
+ 'mentions_specific_facility', 'coherence_with_major', 'has_red_flag']
460
+
461
+ sample_ps = next(iter(ps_yale.values()), {}) if ps_yale else {}
462
+ PS_DIMS = [k for k in sample_ps.keys() if k not in ['student_id', 'success', 'error', 'note', 'essay_type']
463
+ and not k.startswith('is_')]
464
+ if not PS_DIMS:
465
+ PS_DIMS = ['show_not_tell', 'reflection_depth', 'authentic_voice',
466
+ 'coherence_focus', 'overall_effectiveness']
467
+
468
+ # ============================================================
469
+ # 4. DEFINE FEATURE GROUPS
470
+ # ============================================================
471
+ STUDENT_LEVEL_NUMERIC = [
472
+ 'toefl', 'sat', 'gpa',
473
+ 'act_total_count', 'act_type_diversity',
474
+ *[f'act_slot_pca_{i}' for i in range(20)],
475
+ 'honors_max_score', 'honors_avg_score', 'honors_min_score',
476
+ 'honors_count', 'honors_total_score',
477
+ 'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
478
+ 'honors_has_national', 'honors_quality_ratio',
479
+ 'cuilu_hs_top10_rate', 'cuilu_hs_top20_rate',
480
+ 'cuilu_hs_top10_count', 'cuilu_hs_top20_count',
481
+ 'cuilu_hs_total',
482
+ 'cuilu_feeder_rank', 'cuilu_hs_type_rate', 'cuilu_region_rate',
483
+ 'hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_to_univ_hist_admits',
484
+ 'hs_overall_hist_rate',
485
+ 'summer_max_geili', 'summer_has_elite', 'summer_count',
486
+ 'summer_program_count', 'summer_difficulty_max',
487
+ 'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice', 'ps2_overall', 'ps2_mean',
488
+ 'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
489
+ 'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
490
+ 'has_honors', 'has_act', 'has_cuilu',
491
+ ]
492
+
493
+ # Conditionally include act_bert_pca (Mode A keeps, Mode B removes)
494
+ if EXPERIMENT_MODE != 'B':
495
+ STUDENT_LEVEL_NUMERIC.extend([f'act_bert_pca_{i}' for i in range(16)])
496
+
497
+ # Conditionally include ps_bert_pca
498
+ if not ABLATE_PS_BERT:
499
+ STUDENT_LEVEL_NUMERIC.extend([f'ps_bert_pca_{i}' for i in range(16)])
500
+
501
+ # V9: Include activity LLM labels
502
+ if EXPERIMENT_MODE in ['A', 'B']:
503
+ STUDENT_LEVEL_NUMERIC.extend(ACT_LABEL_COLS)
504
+ STUDENT_LEVEL_NUMERIC.extend(['act_label_mean', 'act_label_max', 'act_label_min',
505
+ 'act_label_std', 'act_label_range', 'has_act_labels'])
506
+
507
+ # V9+PS_V5: Include PS V5 hybrid features (LLM extraction + programmatic)
508
+ ps5_feature_cols = [c for c in df.columns if c.startswith('ps5_') or c in [
509
+ 'ps_word_count_v5', 'ps_flesch_reading_ease', 'ps_flesch_kincaid_grade',
510
+ 'ps_gunning_fog', 'ps_coleman_liau', 'ps_lexical_diversity',
511
+ 'ps_sentence_count', 'ps_avg_sentence_length', 'ps_sentence_length_std',
512
+ 'ps_max_sentence_length', 'ps_min_sentence_length',
513
+ 'ps_sentiment_compound', 'ps_sentiment_positive', 'ps_sentiment_negative', 'ps_sentiment_neutral',
514
+ 'ps_paragraph_count', 'ps_i_count', 'ps_i_ratio', 'ps_we_count', 'ps_my_count',
515
+ 'ps_question_count', 'ps_exclamation_count', 'ps_has_dialogue', 'ps_quote_count',
516
+ 'ps_avg_word_length', 'ps_long_word_ratio', 'ps_transition_count', 'ps_power_word_count']]
517
+ STUDENT_LEVEL_NUMERIC.extend(ps5_feature_cols)
518
+ print(f" PS V5 hybrid features added: {len(ps5_feature_cols)} columns")
519
+
520
+ # PS-related features for special school_mean handling
521
+ PS_RELATED_FEATURES = set([
522
+ *[f'ps_bert_pca_{i}' for i in range(16)],
523
+ 'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice',
524
+ 'ps2_overall', 'ps2_mean',
525
+ 'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
526
+ 'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
527
+ 'ps_word_count',
528
+ ])
529
+ # V9+PS_V5: Add PS V5 features to PS_RELATED_FEATURES for proper residualization
530
+ PS_RELATED_FEATURES.update(set(ps5_feature_cols))
531
+
532
+ # V10: Include expanded Supp features (pre-computed in CSV)
533
+ supp_in_data = [c for c in SUPP_ALL_COLS if c in df.columns]
534
+ STUDENT_LEVEL_NUMERIC.extend(supp_in_data)
535
+ print(f" Supp V2 expanded features added: {len(supp_in_data)} columns")
536
+ # Row-level supp features are school-level (student x school)
537
+ # Student-level supp aggregates capture overall supp writing ability
538
+
539
+ # V9: Activity label features need special handling (only has_act_labels=1 for school_mean)
540
+ ACT_LABEL_FEATURES = set(ACT_LABEL_COLS + ['act_label_mean', 'act_label_max', 'act_label_min',
541
+ 'act_label_std', 'act_label_range'])
542
+
543
+ # Add act_type_count columns
544
+ act_type_cols_in_data = [c for c in df.columns if c.startswith('act_type_count_')]
545
+ STUDENT_LEVEL_NUMERIC.extend(act_type_cols_in_data)
546
+
547
+ # V12 NEW #26: ACT_DIMS from CSV (these are the 14 features from llm_activity_scores.json, now pre-merged)
548
+ ACT_DIMS_CSV_COLS = [
549
+ 'act_max_power_index', 'act_avg_power_index', 'act_n_high_power',
550
+ 'act_n_founder', 'act_n_president', 'act_max_scope',
551
+ 'act_has_publication', 'act_has_patent', 'act_has_summer_program',
552
+ 'act_summer_program_tier', 'act_has_olympiad', 'act_olympiad_level',
553
+ 'act_activity_coherence', 'act_spike_strength'
554
+ ]
555
+ act_dims_in_data = [c for c in ACT_DIMS_CSV_COLS if c in df.columns]
556
+ STUDENT_LEVEL_NUMERIC.extend(act_dims_in_data)
557
+ print(f" V12 ACT_DIMS CSV features added: {len(act_dims_in_data)} columns")
558
+
559
+ # V12 NEW #27: Per-program summer features
560
+ SUMMER_PERPROGRAM_COLS = [
561
+ 'summer_n_geili_ge5', 'summer_n_geili_ge6', 'summer_n_geili_ge7', 'summer_n_geili_ge8',
562
+ 'summer_geili_sum', 'summer_geili_std', 'summer_geili_range',
563
+ 'summer_second_max_geili', 'summer_top2_avg_geili',
564
+ 'summer_geili_concentration', 'summer_award_count'
565
+ ]
566
+ summer_pp_in_data = [c for c in SUMMER_PERPROGRAM_COLS if c in df.columns]
567
+ STUDENT_LEVEL_NUMERIC.extend(summer_pp_in_data)
568
+ print(f" V12 per-program summer features added: {len(summer_pp_in_data)} columns")
569
+
570
+ # Filter to existing
571
+ STUDENT_LEVEL_NUMERIC = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
572
+ print(f"\n Student-level numeric features: {len(STUDENT_LEVEL_NUMERIC)}")
573
+
574
+ KEY_STUDENT_FEATURES = [
575
+ 'toefl', 'sat', 'gpa',
576
+ 'honors_max_score', 'honors_avg_score', 'honors_count',
577
+ 'honors_quality_ratio',
578
+ 'act_type_diversity', 'act_total_count',
579
+ 'hs_to_univ_hist_rate_smoothed',
580
+ 'summer_max_geili',
581
+ 'ps2_overall', 'ps2_character_revelation', 'ps2_craft_voice',
582
+ # V12: Add key ACT_DIMS and summer features for interactions
583
+ 'act_n_founder', 'act_spike_strength', 'act_activity_coherence',
584
+ 'summer_n_geili_ge7', 'summer_geili_sum',
585
+ ]
586
+
587
+ # V9: Add top activity labels to key features for interactions
588
+ if EXPERIMENT_MODE in ['A', 'B']:
589
+ KEY_STUDENT_FEATURES.extend(['act_label_mean', 'social_impact_depth',
590
+ 'tone_calibration', 'academic_depth'])
591
+
592
+ LLM_INTERACTION_FEATURES = [
593
+ 'llm_act_mean', 'llm_act_max', 'llm_act_avg_power_index',
594
+ 'supp_mean', 'supp_max', 'supp_composite', 'supp_student_avg_composite',
595
+ 'ps_mean', 'major_difficulty',
596
+ 'ps2_mean', 'ps2_overall',
597
+ # V12: Add ACT_DIMS aggregates for interactions
598
+ 'act_avg_power_index', 'act_max_power_index',
599
+ ]
600
+
601
# ============================================================
# 5. BUILD FEATURES
# ============================================================
def build_features_base(df):
    """Build the fold-independent (base) feature set.

    Adds LLM-derived activity / personal-statement / supplement scores,
    major difficulty, simple aggregates, scaled pairwise interactions, and
    label-encoded categoricals. Relies on module-level lookups
    (``act_scores``, ``ps_yale``, ``major_diff``) keyed by student id.

    Parameters
    ----------
    df : pandas.DataFrame
        Application-level rows; must include at least 'year', 'student_id',
        'school', 'major_cat', 'toefl', 'sat', 'gpa'.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The augmented copy of *df* and the names of the label-encoded
        categorical columns.
    """
    df = df.copy()

    # 2025 is an in-progress admission cycle; flag it so models can discount it.
    df['is_partial_year'] = (df['year'] == 2025).astype(int)
    df['year_cat'] = df['year'].astype(str)
    # Normalize float-formatted ids ("12345.0" -> "12345") to match the string
    # keys of the LLM score lookups.
    # FIX: anchor the pattern to the end of the string. The previous
    # substring replace ('.0', regex=False) removed ".0" anywhere, corrupting
    # ids such as "1.05" -> "15"; behavior is unchanged for integer-valued ids.
    df['sid_str'] = df['student_id'].astype(str).str.replace(r'\.0$', '', regex=True)

    # LLM Activity features: one column per scored dimension, NaN when the
    # student has no score for that dimension.
    for dim in ACT_DIMS:
        col_name = f'llm_act_{dim}'
        df[col_name] = df['sid_str'].map(
            lambda s, d=dim: safe_num(act_scores.get(s, {}).get(d, np.nan)))

    # LLM Supp features - V10: already pre-computed in CSV, just compute aggregates
    # supp_composite and row-level scores are already in the dataframe

    # Major difficulty, looked up by "<school>_<major_cat>"; NaN when unknown.
    def get_major_diff(row):
        key = f"{row['school']}_{row['major_cat']}"
        return safe_num(major_diff.get(key, {}).get('difficulty_score', np.nan))
    df['major_difficulty'] = df.apply(get_major_diff, axis=1)

    # PS Yale scores, one column per dimension.
    for dim in PS_DIMS:
        col_name = f'ps_{dim}'
        df[col_name] = df['sid_str'].map(
            lambda s, d=dim: safe_num(ps_yale.get(s, {}).get(d, np.nan)))

    # Aggregates over the per-dimension activity scores (NaN-aware).
    llm_act_cols = [f'llm_act_{d}' for d in ACT_DIMS]
    valid_act = df[llm_act_cols]
    df['llm_act_mean'] = valid_act.mean(axis=1)
    df['llm_act_max'] = valid_act.max(axis=1)
    df['llm_act_n_valid'] = valid_act.notna().sum(axis=1)

    # V10: Use pre-computed supp_composite as supp_mean, and compute supp_max
    # from row-level scores when they are present.
    supp_row_in_df = [c for c in SUPP_ROW_COLS if c in df.columns and c != 'supp_composite']
    if supp_row_in_df:
        valid_supp = df[supp_row_in_df]
        df['supp_mean'] = valid_supp.mean(axis=1)
        df['supp_max'] = valid_supp.max(axis=1)
    elif 'supp_composite' in df.columns:
        df['supp_mean'] = df['supp_composite']
        df['supp_max'] = df['supp_composite']
    else:
        df['supp_mean'] = np.nan
        df['supp_max'] = np.nan

    ps_cols = [f'ps_{d}' for d in PS_DIMS]
    valid_ps = df[ps_cols]
    df['ps_mean'] = valid_ps.mean(axis=1)

    # Basic interactions, scaled so magnitudes stay comparable across features.
    df['toefl_x_sat'] = df['toefl'] * df['sat'] / 10000.0
    df['gpa_x_toefl'] = df['gpa'] * df['toefl'] / 100.0
    df['llm_act_x_supp'] = df['llm_act_mean'] * df['supp_mean']

    if 'honors_avg_score' in df.columns:
        df['honors_x_sat'] = df['honors_avg_score'] * df['sat'] / 1600
        df['honors_x_toefl'] = df['honors_avg_score'] * df['toefl'] / 120

    if 'cuilu_hs_top10_rate' in df.columns and 'taste_score_sensitivity' in df.columns:
        df['cuilu_x_taste'] = df['cuilu_hs_top10_rate'] * df['taste_score_sensitivity']

    # V9: Activity label interactions (modes A/B only).
    if EXPERIMENT_MODE in ['A', 'B'] and 'act_label_mean' in df.columns:
        df['act_label_x_supp'] = df['act_label_mean'] * df['supp_mean']
        df['act_label_x_llm_act'] = df['act_label_mean'] * df['llm_act_mean']
        if 'ps2_mean' in df.columns:
            df['act_label_x_ps2'] = df['act_label_mean'] * df['ps2_mean']
        print(f" V9: Created activity label interaction features")

    # V11 NEW #24: Domain-specific interaction features
    # GPA × summer elite: academic depth in elite programs
    if 'gpa' in df.columns and 'summer_has_elite' in df.columns:
        df['gpa_x_summer_elite'] = df['gpa'] * df['summer_has_elite']
    # Honors × activity label: well-roundedness signal
    if 'honors_avg_score' in df.columns and 'act_label_mean' in df.columns:
        df['honors_x_act_label'] = df['honors_avg_score'] * df['act_label_mean']
    # Portfolio size × supp composite: application completeness × quality
    if 'portfolio_size' in df.columns and 'supp_student_avg_composite' in df.columns:
        df['portfolio_x_supp_avg'] = df['portfolio_size'] * df['supp_student_avg_composite']
    # NOTE(review): this message prints even when fewer than 3 of the guarded
    # columns exist — confirm whether it should be conditional.
    print(f" V11: Created 3 new domain-specific interaction features")

    # Categoricals: base columns plus school-crossed combinations, all
    # label-encoded to ints (CatBoost receives them via cat_features indices).
    cat_cols = ['school', 'round_cat', 'major_cat', 'hs_cat', 'year_cat', 'hs_name', 'province']
    cat_cols = [c for c in cat_cols if c in df.columns]

    if 'round_cat' in df.columns:
        df['school_round'] = df['school'].astype(str) + '_' + df['round_cat'].astype(str)
        cat_cols.append('school_round')
    df['school_major'] = df['school'].astype(str) + '_' + df['major_cat'].astype(str)
    cat_cols.append('school_major')
    if 'hs_cat' in df.columns:
        df['school_hstype'] = df['school'].astype(str) + '_' + df['hs_cat'].astype(str)
        cat_cols.append('school_hstype')

    for c in cat_cols:
        df[c] = df[c].fillna('_MISSING_').astype(str)
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c]).astype(int)

    return df, cat_cols
707
+
708
+
709
def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
    """Add fold-dependent features derived only from the rows in *train_mask*.

    Computes smoothed per-school admit rates, ED/ED2 round boosts,
    school-mean-residualized student features, activity-label PCA,
    within-school percentiles, and interaction terms, then assembles the
    final feature list. All school-level statistics are fit on the training
    subset only, to avoid leaking validation/test targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``build_features_base`` (all rows, train + eval).
    train_mask : boolean indexer
        Marks the rows whose targets may be used to fit statistics.
    cat_cols : list of str
        Label-encoded categorical column names (always kept as features).
    selected_features : set of str or None
        When given, the feature list is filtered to this set plus a
        must-keep whitelist.

    Returns
    -------
    (df, feature_cols, cat_cols, cat_indices)
        Augmented frame, final feature names, categorical names, and the
        positions of the categorical columns within ``feature_cols``.
    """
    df = df.copy()

    train_df = df[train_mask]
    global_rate = train_df[TARGET].mean()

    # Per-school admit statistics, fit on training rows only.
    school_stats = train_df.groupby('school').agg(
        school_raw_rate=(TARGET, 'mean'),
        school_n_apps=(TARGET, 'count'),
        school_n_admits=(TARGET, 'sum'),
    ).reset_index()

    # Bayesian-style shrinkage toward the global rate; a school needs ~30
    # applications before its own rate dominates the prior.
    SMOOTH_STRENGTH = 30
    school_stats['school_base_rate'] = (
        (school_stats['school_raw_rate'] * school_stats['school_n_apps'] + global_rate * SMOOTH_STRENGTH) /
        (school_stats['school_n_apps'] + SMOOTH_STRENGTH)
    )

    df = df.merge(school_stats[['school', 'school_base_rate', 'school_n_apps', 'school_n_admits']],
                  on='school', how='left')
    # Schools unseen in training fall back to the global rate / zero counts.
    df['school_base_rate'] = df['school_base_rate'].fillna(global_rate)
    df['school_n_apps'] = df['school_n_apps'].fillna(0)
    df['school_n_admits'] = df['school_n_admits'].fillna(0)

    # ED boost: per-school ED1-vs-RD admit-rate gap (only for schools with both).
    ed1_mask = train_df['is_ed1'] == 1
    rd_mask = train_df['is_early'] == 0
    ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
    rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()

    ed_boost_map = {}
    for school in ed1_school_rates.index:
        if school in rd_school_rates.index:
            ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
    df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)

    # Same gap for ED2 rounds.
    ed2_mask = train_df['is_ed2'] == 1
    ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
    ed2_boost_map = {}
    for school in ed2_school_rates.index:
        if school in rd_school_rates.index:
            ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
    df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)

    # Residualize student features: value minus the school's training mean,
    # where the mean is computed on the subset that actually has the signal.
    student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
    train_has_ps = train_df[train_df['has_ps'] == 1]

    # V9: Pre-compute has_act_labels=1 subset for activity label features
    if 'has_act_labels' in train_df.columns:
        train_has_act_labels = train_df[train_df['has_act_labels'] == 1]
    else:
        train_has_act_labels = train_df

    resid_cols = []
    for col in student_feat_available:
        resid_col = f'{col}_resid'

        if col in PS_RELATED_FEATURES:
            school_mean_series = train_has_ps.groupby('school')[col].mean()
        elif col in ACT_LABEL_FEATURES:
            # V9: Use has_act_labels=1 subset for activity label features
            school_mean_series = train_has_act_labels.groupby('school')[col].mean()
        elif col.startswith('honors_') and col != 'honors_count':
            train_has_honors = train_df[train_df['honors_count'] > 0]
            school_mean_series = train_has_honors.groupby('school')[col].mean()
        elif col.startswith('act_bert_pca_') or col.startswith('act_slot_pca_'):
            train_has_act = train_df[train_df['act_total_count'] > 0]
            school_mean_series = train_has_act.groupby('school')[col].mean()
        elif col.startswith('cuilu_hs_to_univ') or col in ['cuilu_hs_top10_rate', 'cuilu_hs_top20_rate', 'cuilu_hs_top10_count', 'cuilu_hs_top20_count']:
            train_has_cuilu = train_df[train_df['cuilu_hs_total'] > 0]
            school_mean_series = train_has_cuilu.groupby('school')[col].mean()
        else:
            school_mean_series = train_df.groupby('school')[col].mean()

        col_school_mean = df['school'].map(school_mean_series)
        df[resid_col] = df[col] - col_school_mean
        resid_cols.append(resid_col)

    # V9 NEW #17: Activity label PCA (reduce multicollinearity)
    act_label_pca_cols = []
    if EXPERIMENT_MODE in ['A', 'B']:
        act_label_cols_in_df = [c for c in ACT_LABEL_COLS if c in df.columns]
        if act_label_cols_in_df:
            # Fit PCA on training data only
            train_label_data = train_df[act_label_cols_in_df].dropna()
            if len(train_label_data) > N_ACT_LABEL_PCA * 2:
                scaler = StandardScaler()
                pca = PCA(n_components=N_ACT_LABEL_PCA, random_state=42)

                train_scaled = scaler.fit_transform(train_label_data)
                pca.fit(train_scaled)

                # Transform all data
                all_label_data = df[act_label_cols_in_df].copy()
                # Fill NaN with column mean for PCA transform, then set back to NaN
                has_any_label = all_label_data.notna().any(axis=1)
                fill_means = train_label_data.mean()
                all_label_filled = all_label_data.fillna(fill_means)
                all_scaled = scaler.transform(all_label_filled)
                pca_result = pca.transform(all_scaled)

                for i in range(N_ACT_LABEL_PCA):
                    col_name = f'act_label_pca_{i}'
                    df[col_name] = pca_result[:, i]
                    # Set to NaN where original labels were all NaN
                    df.loc[~has_any_label, col_name] = np.nan
                    act_label_pca_cols.append(col_name)

                var_explained = pca.explained_variance_ratio_
                print(f" V9 NEW #17: Activity label PCA: {len(act_label_cols_in_df)} -> {N_ACT_LABEL_PCA} components")
                print(f" Variance explained: {var_explained.sum():.3f} ({', '.join(f'{v:.3f}' for v in var_explained)})")

    # ps2_mean_school_pctile: rank of the applicant's PS2 score within the
    # school's training distribution (schools with <=2 scored rows skipped).
    pctile_ps_cols = []
    if 'ps2_mean' in df.columns:
        ps_pctile_col = 'ps2_mean_school_pctile'
        school_ps_distributions = {}
        for school_id in train_has_ps['school'].unique():
            vals = train_has_ps[train_has_ps['school'] == school_id]['ps2_mean'].dropna().values
            if len(vals) > 2:
                school_ps_distributions[school_id] = vals

        def compute_ps_pctile(row, sd=school_ps_distributions):
            # Fraction of the school's training scores at or below this row's.
            school_id = row['school']
            val = row['ps2_mean']
            if pd.isna(val) or school_id not in sd:
                return np.nan
            return np.mean(sd[school_id] <= val)

        df[ps_pctile_col] = df.apply(compute_ps_pctile, axis=1)
        pctile_ps_cols.append(ps_pctile_col)

    # V9 NEW #18: Activity label school percentile
    act_label_pctile_cols = []
    if EXPERIMENT_MODE in ['A', 'B'] and 'act_label_mean' in df.columns:
        al_pctile_col = 'act_label_mean_school_pctile'
        school_al_distributions = {}
        for school_id in train_has_act_labels['school'].unique():
            vals = train_has_act_labels[train_has_act_labels['school'] == school_id]['act_label_mean'].dropna().values
            if len(vals) > 2:
                school_al_distributions[school_id] = vals

        def compute_al_pctile(row, sd=school_al_distributions):
            school_id = row['school']
            val = row['act_label_mean']
            if pd.isna(val) or school_id not in sd:
                return np.nan
            return np.mean(sd[school_id] <= val)

        df[al_pctile_col] = df.apply(compute_al_pctile, axis=1)
        act_label_pctile_cols.append(al_pctile_col)
        n_valid = df[al_pctile_col].notna().sum()
        print(f" V9 NEW #18: {al_pctile_col}: {n_valid} valid values")

    # Interactions: student signal × smoothed school base rate.
    interaction_cols = []
    for col in KEY_STUDENT_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

            resid_col = f'{col}_resid'
            if resid_col in df.columns:
                int_resid_col = f'{col}_resid_x_rate'
                df[int_resid_col] = df[resid_col] * df['school_base_rate']
                interaction_cols.append(int_resid_col)

    for col in LLM_INTERACTION_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

    if 'portfolio_size' in df.columns:
        df['portfolio_x_school_rate'] = df['portfolio_size'] * df['school_base_rate']
        interaction_cols.append('portfolio_x_school_rate')

    # Round flags × the corresponding school-level ED boost.
    if 'is_ed1' in df.columns:
        df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
        interaction_cols.append('ed1_x_ed_boost')
    if 'is_ed2' in df.columns:
        df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
        interaction_cols.append('ed2_x_ed2_boost')

    for flag in ['has_sat', 'has_toefl', 'has_gpa']:
        if flag in df.columns:
            int_col = f'{flag}_x_school_rate'
            df[int_col] = df[flag] * df['school_base_rate']
            interaction_cols.append(int_col)

    if 'ps2_mean_school_pctile' in df.columns:
        df['ps2_pctile_x_school_rate'] = df['ps2_mean_school_pctile'] * df['school_base_rate']
        interaction_cols.append('ps2_pctile_x_school_rate')

    # V9 NEW #19: Activity label percentile x school_base_rate
    if 'act_label_mean_school_pctile' in df.columns:
        df['act_label_pctile_x_school_rate'] = df['act_label_mean_school_pctile'] * df['school_base_rate']
        interaction_cols.append('act_label_pctile_x_school_rate')

    # Student percentile within school (same scheme as the PS2 percentile).
    pctile_cols = []
    for col in ['toefl', 'sat', 'gpa', 'honors_max_score', 'llm_act_mean', 'supp_mean']:
        if col not in df.columns:
            continue
        pctile_col = f'{col}_school_pctile'
        school_distributions = {}
        for school_id in train_df['school'].unique():
            vals = train_df[train_df['school'] == school_id][col].dropna().values
            if len(vals) > 2:
                school_distributions[school_id] = vals

        def compute_pctile(row, col=col, sd=school_distributions):
            # col/sd bound as defaults to freeze the current loop iteration.
            school_id = row['school']
            val = row[col]
            if pd.isna(val) or school_id not in sd:
                return np.nan
            return np.mean(sd[school_id] <= val)

        df[pctile_col] = df.apply(compute_pctile, axis=1)
        pctile_cols.append(pctile_col)

    pctile_cols.extend(pctile_ps_cols)
    pctile_cols.extend(act_label_pctile_cols)

    # Student competitiveness: crude normalized strength score vs. the
    # school's selectivity. NOTE(review): `weights` is collected but never
    # applied — the final score is an unweighted mean; confirm intended.
    if all(c in df.columns for c in ['toefl', 'sat', 'honors_max_score']):
        components = []
        weights = []
        for col, w, scale in [('toefl', 0.3, 120), ('sat', 0.3, 1600),
                              ('honors_max_score', 0.2, 10), ('llm_act_mean', 0.2, 10)]:
            if col in df.columns:
                components.append(df[col] / scale)
                weights.append(w)
        if components:
            strength_df = pd.DataFrame(components).T
            df['student_strength'] = strength_df.mean(axis=1)
            df['strength_vs_school'] = df['student_strength'] - (1 - df['school_base_rate'])

    # Build final feature list: every numeric column plus the categoricals,
    # minus identifiers and constant columns.
    num_cols = [c for c in df.columns if df[c].dtype in ['float64', 'int64', 'float32', 'int32']
                and c not in [TARGET, 'student_id', 'year', 'Unnamed: 0']]

    all_feat = list(set(num_cols + cat_cols))
    feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
    for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size_raw']:
        if remove in feature_cols:
            feature_cols.remove(remove)

    # Drop constant columns (no signal).
    to_drop = [c for c in feature_cols if df[c].nunique() <= 1]
    feature_cols = [c for c in feature_cols if c not in to_drop]

    if selected_features is not None:
        # Whitelist of structural features that survive selection regardless
        # of their Stage-1 importance.
        must_keep = set(cat_cols) | {'school_base_rate', 'school_n_apps', 'school_n_admits',
                                     'student_strength', 'strength_vs_school',
                                     'school_ed_boost', 'school_ed2_boost',
                                     'is_ed1', 'is_ed2', 'is_rea', 'is_early',
                                     'ed1_x_ed_boost', 'ed2_x_ed2_boost',
                                     'has_sat', 'has_toefl', 'has_gpa',
                                     'portfolio_size', 'portfolio_size_bin', 'portfolio_x_school_rate',
                                     'ps2_mean_school_pctile', 'ps2_pctile_x_school_rate',
                                     'has_honors', 'has_act', 'has_cuilu'}
        # V9: Always keep activity label features
        if EXPERIMENT_MODE in ['A', 'B']:
            must_keep.update(set(act_label_pca_cols))
            must_keep.update({'act_label_mean', 'act_label_max', 'act_label_std',
                              'act_label_mean_school_pctile', 'act_label_pctile_x_school_rate',
                              'has_act_labels', 'act_label_x_supp', 'act_label_x_llm_act'})
        feature_cols = [c for c in feature_cols if c in selected_features or c in must_keep]

    # Infinities break LightGBM/XGBoost; convert them to NaN.
    for c in feature_cols:
        if df[c].dtype in ['float64', 'float32']:
            df[c] = df[c].replace([np.inf, -np.inf], np.nan)

    cat_indices = [feature_cols.index(c) for c in cat_cols if c in feature_cols]

    # NOTE(review): new_feat_count is computed but never used — candidate for removal.
    new_feat_count = len(resid_cols) + len(interaction_cols) + len(pctile_cols) + len(act_label_pca_cols) + 5
    print(f" Features: {len(resid_cols)} resid + {len(interaction_cols)} interact + {len(pctile_cols)} pctile + {len(act_label_pca_cols)} label_pca = total {len(feature_cols)}")

    return df, feature_cols, cat_cols, cat_indices
990
+
991
+
992
# ============================================================
# 6. BUILD BASE FEATURES
# ============================================================
# Materialize the fold-independent feature table once, up front.
df_base, cat_cols = build_features_base(df)
print(f"\nBase features built. Shape: {df_base.shape}")

# Targets plus the grouping key (student_id keeps all of one student's
# applications inside a single CV fold).
y = df_base[TARGET].to_numpy()
groups = df_base['student_id'].to_numpy()
1000
+
1001
# ============================================================
# 7. STAGE 1: FEATURE IMPORTANCE ESTIMATION
# ============================================================
# Quick 5-fold grouped CV with a small CatBoost to rank features; the
# averaged importances drive the Stage-2 feature selection.
print(f"\n{'='*70}")
print(f" STAGE 1: FEATURE IMPORTANCE ESTIMATION")
print(f"{'='*70}")

stage1_fi = []
gkf_s1 = GroupKFold(n_splits=5)
for fold, (tr_idx, va_idx) in enumerate(gkf_s1.split(df_base, y, groups)):
    # Fold-dependent features must only see training targets.
    train_mask = pd.Series(False, index=df_base.index)
    train_mask.iloc[tr_idx] = True

    df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
        df_base, train_mask, cat_cols)

    X_tr = df_fold[feat_cols_f].iloc[tr_idx]
    X_va = df_fold[feat_cols_f].iloc[va_idx]
    y_tr = y[tr_idx]
    y_va = y[va_idx]

    # CatBoost requires integer dtype for categorical feature columns.
    for c in cat_cols_f:
        if c in X_tr.columns:
            X_tr[c] = X_tr[c].astype(int)
            X_va[c] = X_va[c].astype(int)

    # Intentionally lighter settings than Stage 2 — this model only ranks features.
    cb = CatBoostClassifier(
        iterations=500, depth=6, learning_rate=0.05,
        l2_leaf_reg=7, random_seed=42, verbose=0,
        cat_features=cat_idx_f, eval_metric='AUC',
        early_stopping_rounds=50)
    pool_tr = Pool(X_tr, y_tr, cat_features=cat_idx_f)
    pool_va = Pool(X_va, y_va, cat_features=cat_idx_f)
    cb.fit(pool_tr, eval_set=pool_va, verbose=0)

    fi = cb.get_feature_importance()
    stage1_fi.append(fi)

    auc = roc_auc_score(y_va, cb.predict_proba(Pool(X_va, cat_features=cat_idx_f))[:, 1])
    print(f" Fold {fold+1}/5: AUC={auc:.4f}, Features={len(feat_cols_f)}")

    # Feature names are identical across folds; capture them once.
    if fold == 0:
        all_feature_names = feat_cols_f

    del cb, pool_tr, pool_va, df_fold; gc.collect()

# Average importances across folds and rank descending.
avg_fi = np.mean(stage1_fi, axis=0)
fi_pairs = sorted(zip(all_feature_names, avg_fi), key=lambda x: -x[1])

# Keep all categoricals plus the top-N non-categorical features.
selected_set = set(cat_cols)
n_added = 0
for fname, imp in fi_pairs:
    if fname not in cat_cols:
        selected_set.add(fname)
        n_added += 1
    if n_added >= FEATURE_SELECT_TOP_N:
        break

print(f"\n Feature selection: {len(all_feature_names)} -> {len(selected_set)} features")
print(f" Top 30 features:")
for i, (fname, imp) in enumerate(fi_pairs[:30]):
    # Tag each feature family for the report (first matching tag wins).
    marker = ""
    if fname in ACT_DIMS_CSV_COLS: marker = " [ACT_DIMS_V12]"
    elif fname in SUMMER_PERPROGRAM_COLS: marker = " [SUMMER_PP_V12]"
    elif 'act_label' in fname or fname in ACT_LABEL_COLS: marker = " [ACT_LABEL_V9]"
    elif '_resid' in fname: marker = " [R]"
    elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [I]"
    elif '_school_pctile' in fname: marker = " [P]"
    elif 'school_base_rate' in fname: marker = " [S]"
    elif 'ed_boost' in fname: marker = " [ED]"
    elif 'ps2_' in fname: marker = " [PS2]"
    # NOTE(review): 'and' binds tighter than 'or', so this reads as
    # 'ps5_' in fname or (startswith('ps_') and 'bert' not in fname) —
    # confirm that grouping is intended.
    elif 'ps5_' in fname or fname.startswith('ps_') and 'bert' not in fname: marker = " [PS_V5]"
    elif 'supp_' in fname: marker = " [SUPP_V2]"
    elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
    print(f" {i+1:3d}. {fname:<55s} {imp:>8.2f}{marker}")

# Count V9 new features in top 50
v9_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_label' in f or f in ACT_LABEL_COLS)
ps5_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'ps5_' in f or (f.startswith('ps_') and 'bert' not in f and f in ps5_feature_cols))
supp_v2_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'supp_' in f)
print(f" PS V5 features in top 50: {ps5_in_top50}")
print(f" Supp V2 features in top 50: {supp_v2_in_top50}")
bert_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_bert_pca' in f)
print(f"\n V9 activity label features in top 50: {v9_in_top50}")
print(f" act_bert_pca features in top 50: {bert_in_top50}")
1086
+
1087
# ============================================================
# 8. TEMPORAL VALIDATION
# ============================================================
# Train on 2020-2023, evaluate on 2024 — approximates next-cycle performance
# and is compared against the deltas of previous model versions below.
print(f"\n{'='*70}")
print(f" TEMPORAL VALIDATION (2020-2023 -> 2024)")
print(f"{'='*70}")

mask_train_temporal = df_base['year'].isin([2020, 2021, 2022, 2023])
mask_test_temporal = df_base['year'] == 2024

temporal_results = {}
if mask_test_temporal.sum() > 0:
    # Fold-dependent features fit only on the 2020-2023 rows.
    df_temporal, feat_cols_t, cat_cols_t, cat_idx_t = add_residualized_features(
        df_base, mask_train_temporal, cat_cols, selected_features=selected_set)

    X_t = df_temporal[feat_cols_t].copy()
    for c in cat_cols_t:
        if c in X_t.columns:
            X_t[c] = X_t[c].astype(int)

    X_tr_t = X_t[mask_train_temporal]
    X_te_t = X_t[mask_test_temporal]
    y_tr_t = y[mask_train_temporal]
    y_te_t = y[mask_test_temporal]

    # LightGBM/XGBoost get a NaN sentinel; CatBoost handles NaN natively.
    X_tr_t_filled = X_tr_t.fillna(-999)
    X_te_t_filled = X_te_t.fillna(-999)

    print(f" Train: {len(X_tr_t)}, Test: {len(X_te_t)}, Features: {len(feat_cols_t)}")

    # One CB + LGB + XGB trio per seed, fixed-weight blended per seed.
    for seed in SEEDS:
        cb_t = CatBoostClassifier(
            iterations=1500, depth=8, learning_rate=0.02,
            l2_leaf_reg=10, random_seed=seed, verbose=0,
            cat_features=cat_idx_t, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=15,
            random_strength=2, bagging_temperature=0.8)
        pool_tr = Pool(X_tr_t, y_tr_t, cat_features=cat_idx_t)
        pool_te = Pool(X_te_t, y_te_t, cat_features=cat_idx_t)
        cb_t.fit(pool_tr, eval_set=pool_te, verbose=0)
        cb_pred = cb_t.predict_proba(Pool(X_te_t, cat_features=cat_idx_t))[:, 1]
        del cb_t; gc.collect()

        lgb_tr = lgb.Dataset(X_tr_t_filled.values, y_tr_t, categorical_feature=cat_idx_t)
        lgb_va = lgb.Dataset(X_te_t_filled.values, y_te_t, categorical_feature=cat_idx_t, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.02, 'num_leaves': 63, 'max_depth': 7,
            'min_child_samples': 30, 'reg_alpha': 0.5, 'reg_lambda': 3.0,
            'feature_fraction': 0.6, 'bagging_fraction': 0.75, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=2000,
                              valid_sets=[lgb_va],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_te_t_filled.values)
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_t_filled.values, label=y_tr_t, enable_categorical=False)
        dtest = xgb.DMatrix(X_te_t_filled.values, label=y_te_t, enable_categorical=False)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 7, 'learning_rate': 0.02,
            'subsample': 0.75, 'colsample_bytree': 0.6,
            'reg_alpha': 0.5, 'reg_lambda': 3.0,
            'min_child_weight': 7,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=2000,
                              evals=[(dtest, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dtest)
        del xgb_model, dtrain, dtest; gc.collect()

        # Fixed blend weights here; Stage 2 searches for optimal weights.
        blend = 0.45 * cb_pred + 0.20 * lgb_pred + 0.35 * xgb_pred
        temporal_results[seed] = {
            'cb': float(roc_auc_score(y_te_t, cb_pred)),
            'lgb': float(roc_auc_score(y_te_t, lgb_pred)),
            'xgb': float(roc_auc_score(y_te_t, xgb_pred)),
            'blend': float(roc_auc_score(y_te_t, blend))
        }
        print(f" Seed {seed}: CB={temporal_results[seed]['cb']:.4f} LGB={temporal_results[seed]['lgb']:.4f} XGB={temporal_results[seed]['xgb']:.4f} Blend={temporal_results[seed]['blend']:.4f}")

    # Hard-coded baselines are the temporal-blend AUCs of earlier versions.
    avg_temporal = np.mean([v['blend'] for v in temporal_results.values()])
    print(f"\n AVG Temporal Blend: {avg_temporal:.4f}")
    print(f" Delta vs V37.3: {avg_temporal - 0.8410:+.4f}")
    print(f" Delta vs V38.2-PRO-V4: {avg_temporal - 0.8555:+.4f}")
    print(f" Delta vs V38.2-PRO-V8: {avg_temporal - 0.8548:+.4f}")
    print(f" Delta vs V38.2-PRO-V9: {avg_temporal - 0.8594:+.4f}")
    print(f" Delta vs V38.2-PRO-V10: {avg_temporal - 0.8631:+.4f}")
    print(f" Delta vs V38.2-PRO-V11: {avg_temporal - 0.8634:+.4f}")

    del df_temporal, X_t; gc.collect()
else:
    # No 2024 rows available — skip the check.
    avg_temporal = 0.0
1182
+
1183
# ============================================================
# 9. STAGE 2: MULTI-SEED GROUPKFOLD
# ============================================================
# Full out-of-fold predictions: for each seed, run N_FOLDS grouped CV with
# the selected feature set and collect OOF vectors per model family.
print(f"\n{'='*70}")
print(f" STAGE 2: MULTI-SEED GROUPKFOLD ({len(SEEDS)} seeds x {N_FOLDS} folds)")
print(f"{'='*70}")

all_cb_oof = []
all_lgb_oof = []
all_xgb_oof = []
all_fi = []
feature_cols_final = None

for seed_idx, seed in enumerate(SEEDS):
    print(f"\n --- Seed {seed} ({seed_idx+1}/{len(SEEDS)}) ---")
    gkf = GroupKFold(n_splits=N_FOLDS)
    cb_oof = np.zeros(len(df_base))
    lgb_oof = np.zeros(len(df_base))
    xgb_oof = np.zeros(len(df_base))

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(df_base, y, groups)):
        # Fold-dependent features recomputed per fold on training rows only.
        train_mask = pd.Series(False, index=df_base.index)
        train_mask.iloc[tr_idx] = True

        df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
            df_base, train_mask, cat_cols, selected_features=selected_set)

        # Capture the post-selection feature names once for the FI report.
        if feature_cols_final is None:
            feature_cols_final = feat_cols_f
            print(f" Total features after selection: {len(feat_cols_f)}")

        X_fold = df_fold[feat_cols_f].copy()
        for c in cat_cols_f:
            if c in X_fold.columns:
                X_fold[c] = X_fold[c].astype(int)

        X_tr_df = X_fold.iloc[tr_idx]
        X_va_df = X_fold.iloc[va_idx]
        y_tr = y[tr_idx]
        y_va = y[va_idx]

        cb = CatBoostClassifier(
            iterations=2000, depth=8, learning_rate=0.02,
            l2_leaf_reg=10, random_seed=seed, verbose=0,
            cat_features=cat_idx_f, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=15,
            random_strength=2, bagging_temperature=0.8)
        pool_tr = Pool(X_tr_df, y_tr, cat_features=cat_idx_f)
        pool_va = Pool(X_va_df, y_va, cat_features=cat_idx_f)
        cb.fit(pool_tr, eval_set=pool_va, verbose=0)
        cb_pred = cb.predict_proba(Pool(X_va_df, cat_features=cat_idx_f))[:, 1]
        cb_oof[va_idx] = cb_pred

        # Keep one importance vector per seed (from the last fold).
        if fold == N_FOLDS - 1:
            all_fi.append(cb.get_feature_importance())
        del cb, pool_tr, pool_va; gc.collect()

        # NaN sentinel for the non-CatBoost learners.
        X_tr_filled = X_tr_df.fillna(-999).values
        X_va_filled = X_va_df.fillna(-999).values

        lgb_tr = lgb.Dataset(X_tr_filled, y_tr, categorical_feature=cat_idx_f)
        lgb_va_ds = lgb.Dataset(X_va_filled, y_va, categorical_feature=cat_idx_f, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.02, 'num_leaves': 63, 'max_depth': 7,
            'min_child_samples': 30, 'reg_alpha': 0.5, 'reg_lambda': 3.0,
            'feature_fraction': 0.6, 'bagging_fraction': 0.75, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=2000,
                              valid_sets=[lgb_va_ds],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_va_filled)
        lgb_oof[va_idx] = lgb_pred
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_filled, label=y_tr)
        dval = xgb.DMatrix(X_va_filled, label=y_va)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 7, 'learning_rate': 0.02,
            'subsample': 0.75, 'colsample_bytree': 0.6,
            'reg_alpha': 0.5, 'reg_lambda': 3.0,
            'min_child_weight': 7,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=2000,
                              evals=[(dval, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dval)
        xgb_oof[va_idx] = xgb_pred
        del xgb_model, dtrain, dval, df_fold, X_fold; gc.collect()

        if (fold + 1) % 5 == 0:
            print(f" Fold {fold+1}/{N_FOLDS} done")

    # Per-seed OOF AUCs for each model family.
    cb_auc = roc_auc_score(y, cb_oof)
    lgb_auc = roc_auc_score(y, lgb_oof)
    xgb_auc = roc_auc_score(y, xgb_oof)
    print(f" CB: {cb_auc:.4f} LGB: {lgb_auc:.4f} XGB: {xgb_auc:.4f}")

    all_cb_oof.append(cb_oof)
    all_lgb_oof.append(lgb_oof)
    all_xgb_oof.append(xgb_oof)
1287
+
1288
# ============================================================
# 10. ENSEMBLE & BLEND
# ============================================================
# Average OOF predictions over seeds, grid-search blend weights on the OOF
# AUC, and report final metrics.
print(f"\n{'='*70}")
print(f" ENSEMBLE RESULTS (MODE={EXPERIMENT_MODE})")
print(f"{'='*70}")

cb_avg = np.mean(all_cb_oof, axis=0)
lgb_avg = np.mean(all_lgb_oof, axis=0)
xgb_avg = np.mean(all_xgb_oof, axis=0)

cb_final_auc = roc_auc_score(y, cb_avg)
lgb_final_auc = roc_auc_score(y, lgb_avg)
xgb_final_auc = roc_auc_score(y, xgb_avg)

print(f" CB {len(SEEDS)}-seed avg: {cb_final_auc:.4f}")
print(f" LGB {len(SEEDS)}-seed avg: {lgb_final_auc:.4f}")
print(f" XGB {len(SEEDS)}-seed avg: {xgb_final_auc:.4f}")

# V11: Finer granularity weight search (0.02 step)
# NOTE(review): weights are tuned on the same OOF predictions scored here,
# so the reported blend AUC is mildly optimistic.
best_auc = 0
best_weights = (0.45, 0.20, 0.35)
for w_cb in np.arange(0.30, 0.65, 0.02):
    for w_lgb in np.arange(0.05, 0.40, 0.02):
        w_xgb = 1.0 - w_cb - w_lgb  # weights constrained to sum to 1
        if w_xgb < 0.05 or w_xgb > 0.55: continue
        blend = w_cb * cb_avg + w_lgb * lgb_avg + w_xgb * xgb_avg
        auc = roc_auc_score(y, blend)
        if auc > best_auc:
            best_auc = auc
            best_weights = (w_cb, w_lgb, w_xgb)

# Hard-coded baselines are the OOF blend AUCs of earlier versions.
print(f"\n Best 3-model blend: {best_auc:.4f}")
print(f" Delta vs V37.3: {best_auc - 0.8697:+.4f}")
print(f" Delta vs V38.2-PRO-V4: {best_auc - 0.8758:+.4f}")
print(f" Delta vs V38.2-PRO-V8: {best_auc - 0.8753:+.4f}")
print(f" Delta vs V38.2-PRO-V9: {best_auc - 0.8772:+.4f}")
print(f" Delta vs V38.2-PRO-V10: {best_auc - 0.8784:+.4f}")
print(f" Delta vs V38.2-PRO-V11: {best_auc - 0.8789:+.4f}")
print(f" Weights: CB={best_weights[0]:.2f} LGB={best_weights[1]:.2f} XGB={best_weights[2]:.2f}")

# Rank-average blend as a scale-free sanity check against the weighted blend.
rank_blend = (rankdata(cb_avg) + rankdata(lgb_avg) + rankdata(xgb_avg)) / 3
rank_auc = roc_auc_score(y, rank_blend)
print(f" Rank blend: {rank_auc:.4f}")

final_blend_prob = best_weights[0] * cb_avg + best_weights[1] * lgb_avg + best_weights[2] * xgb_avg
final_auc = roc_auc_score(y, final_blend_prob)
# Clip away exact 0/1 so Brier/log-loss stay finite.
final_brier = brier_score_loss(y, np.clip(final_blend_prob, 1e-7, 1-1e-7))
final_logloss = log_loss(y, np.clip(final_blend_prob, 1e-7, 1-1e-7))

print(f"\n FINAL METRICS:")
print(f" AUC: {final_auc:.4f}")
print(f" Brier: {final_brier:.4f}")
print(f" LogLoss: {final_logloss:.4f}")
1342
+
1343
# ============================================================
# 11. FEATURE IMPORTANCE
# ============================================================
# Report mean feature importance across seeds, tagging each feature
# with the feature-engineering family it came from.
print(f"\n{'='*70}")
print(f" FEATURE IMPORTANCE (MODE={EXPERIMENT_MODE})")
print(f"{'='*70}")

if feature_cols_final and all_fi:
    avg_fi = np.mean(all_fi, axis=0)
    # Sort descending by importance; stable sort keeps ties in input order.
    fi_pairs = sorted(zip(feature_cols_final, avg_fi), key=lambda x: x[1], reverse=True)

    print(f" {'Rank':<5s} {'Feature':<55s} {'Importance':>10s}")
    print(f" {'-'*5} {'-'*55} {'-'*10}")
    # Ordered (predicate, tag) rules: first match wins, mirroring the
    # original if/elif chain (e.g. '_resid' outranks the interaction tag).
    _marker_rules = [
        (lambda f: f in ACT_DIMS_CSV_COLS, " [ACT_DIMS_V12]"),
        (lambda f: f in SUMMER_PERPROGRAM_COLS, " [SUMMER_PP_V12]"),
        (lambda f: 'act_label' in f or f in ACT_LABEL_COLS, " [ACT_LABEL_V9]"),
        (lambda f: '_resid' in f, " [RESID]"),
        (lambda f: '_x_school_rate' in f or '_resid_x_rate' in f, " [INTERACT]"),
        (lambda f: '_school_pctile' in f, " [PCTILE]"),
        (lambda f: f.startswith('school_base_rate'), " [SCHOOL_RATE]"),
        (lambda f: 'act_bert_pca' in f, " [ACT_BERT]"),
        (lambda f: 'ps2_' in f, " [PS2]"),
    ]
    for rank, (fname, imp) in enumerate(fi_pairs[:50], start=1):
        marker = next((tag for pred, tag in _marker_rules if pred(fname)), "")
        print(f" {rank:<5d} {fname:<55s} {imp:>10.2f}{marker}")

    # Count how many V9 activity-label / BERT-PCA features made the top 30.
    v9_in_top30 = sum(1 for f, _ in fi_pairs[:30] if 'act_label' in f or f in ACT_LABEL_COLS)
    bert_in_top30 = sum(1 for f, _ in fi_pairs[:30] if 'act_bert_pca' in f)
    print(f"\n V9 activity label features in top 30: {v9_in_top30}")
    print(f" act_bert_pca features in top 30: {bert_in_top30}")
1374
# ============================================================
# 12. SAVE RESULTS
# ============================================================
# Persist the run summary as JSON and the OOF predictions as CSV,
# then print the closing banner.
elapsed = time.time() - start_time

# NOTE: key order is deliberate — json.dump preserves insertion order.
results = {
    'version': f'V38.2-pro-v12-mode-{EXPERIMENT_MODE}',
    'experiment_mode': EXPERIMENT_MODE,
    'mode_description': mode_desc.get(EXPERIMENT_MODE, 'UNKNOWN'),
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'elapsed_minutes': elapsed / 60,
    'changes': [
        'All V11 features carried forward',
        f'EXPERIMENT MODE: {EXPERIMENT_MODE} - {mode_desc.get(EXPERIMENT_MODE)}',
        'NEW #26: ACT_DIMS (14) from CSV',
        'NEW #27: Per-program summer features (11)',
        'NEW #28: summer_award_count from V4',
        'NEW #29: Aggressive pruning at 100',
    ],
    'comparison': {
        'v37_3': {'auc': 0.8697, 'temporal_auc': 0.8410},
        'v38_2_pro_v4': {'auc': 0.8758, 'temporal_auc': 0.8555},
        'v38_2_pro_v8': {'auc': 0.8753, 'temporal_auc': 0.8548},
        'v38_2_pro_v9': {'auc': 0.8772, 'temporal_auc': 0.8594},
        'v38_2_pro_v10': {'auc': 0.8784, 'temporal_auc': 0.8631},
        'v38_2_pro_v11': {'auc': 0.8789, 'temporal_auc': 0.8634},
    },
    'temporal_validation': {
        'per_seed': temporal_results,
        'avg_blend': float(avg_temporal),
    },
    'groupkfold': {
        'best_3model_blend': float(best_auc),
        'best_weights': list(map(float, best_weights)),
        'rank_blend': float(rank_auc),
    },
    'final_metrics': {
        'auc': float(final_auc),
        'brier': float(final_brier),
        'logloss': float(final_logloss),
    },
    'n_features': len(feature_cols_final) if feature_cols_final else 0,
    'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
}

suffix = f'_mode_{EXPERIMENT_MODE}'
# Shared filename stem for both output artifacts.
out_stem = f'v38_2_pro_v12{suffix}'
with open(os.path.join(OUTPUT_DIR, out_stem + '_results.json'), 'w') as f:
    json.dump(results, f, indent=2)

# OOF predictions: ids + target plus one column per model and the blend.
oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
for pred_col, pred_vals in (
    ('cb_pred', cb_avg),
    ('lgb_pred', lgb_avg),
    ('xgb_pred', xgb_avg),
    ('final_pred', final_blend_prob),
):
    oof_df[pred_col] = pred_vals
oof_df.to_csv(os.path.join(OUTPUT_DIR, out_stem + '_oof_predictions.csv'), index=False)

print(f"\n{'='*70}")
print(f" V38.2-PRO-V12 MODE={EXPERIMENT_MODE} COMPLETE")
print(f" Total time: {elapsed/60:.1f} minutes")
print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'}")
print(f" GroupKFold AUC: {final_auc:.4f}")
print(f" Temporal AUC: {avg_temporal:.4f}")
print(f"{'='*70}")