catninja123 commited on
Commit
a7499a7
·
verified ·
1 Parent(s): 6090173

Upload train_v38_2_pro_v11.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_v38_2_pro_v11.py +1397 -0
train_v38_2_pro_v11.py ADDED
@@ -0,0 +1,1397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ====================================================================
3
+ V38.2-PRO-V11 MODEL - Pruning + Hyperparameter Optimization
4
+ ====================================================================
5
+ Carries forward all V10 features, PLUS:
6
+ NEW #22: Aggressive feature pruning (150 -> 100 top features)
7
+ NEW #23: Hyperparameter tuning (deeper trees, stronger regularization)
8
+ NEW #24: 3 new domain-specific interaction features
9
+ NEW #25: Wider ensemble weight search with finer granularity
10
+
11
+ ABLATION EXPERIMENT (controlled by EXPERIMENT_MODE):
12
+ "A" = Full model with all features (pruned + tuned)
13
+ "B" = Full model - act_bert_pca (replace with labels)
14
+ "C" = Baseline (no new labels, control)
15
+ ====================================================================
16
+ """
17
+ import pandas as pd
18
+ import numpy as np
19
+ import json, os, warnings, sys, time, pickle, gc
20
+ warnings.filterwarnings('ignore')
21
+ from sklearn.model_selection import GroupKFold
22
+ from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss
23
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
24
+ from sklearn.decomposition import PCA
25
+ from scipy.stats import rankdata
26
+
27
+ try:
28
+ from catboost import CatBoostClassifier, Pool
29
+ import lightgbm as lgb
30
+ import xgboost as xgb
31
+ print("All model libraries loaded successfully")
32
+ except ImportError as e:
33
+ print(f"Missing library: {e}")
34
+ import subprocess
35
+ subprocess.check_call([sys.executable, '-m', 'pip', 'install',
36
+ 'catboost', 'lightgbm', 'xgboost', '-q'])
37
+ from catboost import CatBoostClassifier, Pool
38
+ import lightgbm as lgb
39
+ import xgboost as xgb
40
+
41
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
42
+ DATA_DIR = os.path.join(BASE_DIR, 'data')
43
+ OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
44
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
45
+
46
+ TARGET = 'target'
47
+ SEEDS = [42, 123, 456, 789, 2024]
48
+ N_FOLDS = 10
49
+ FEATURE_SELECT_TOP_N = 100 # V11: Aggressive pruning from 150 -> 100
50
+ start_time = time.time()
51
+
52
+ # ============================================================
53
+ # EXPERIMENT MODE - controls ablation variant
54
+ # ============================================================
55
+ # "A" = V8 + new labels (keep act_bert_pca)
56
+ # "B" = V8 + new labels - act_bert_pca (replace BERT with labels)
57
+ # "C" = V8 baseline (no new labels, keep BERT) = control
58
+ EXPERIMENT_MODE = os.environ.get('V9_MODE', 'A')
59
+ ABLATE_PS_BERT = False # Keep PS BERT for now (separate concern)
60
+
61
+ # Activity LLM label columns
62
+ ACT_LABEL_COLS = [
63
+ 'activity_uniqueness', 'impact_quantifiability', 'academic_depth',
64
+ 'social_impact_depth', 'institutional_prestige', 'activity_diversity',
65
+ 'entrepreneurial_initiative', 'cross_activity_synergy',
66
+ 'intellectual_generosity', 'writing_craft', 'personal_voice',
67
+ 'info_architecture', 'tone_calibration'
68
+ ]
69
+ N_ACT_LABEL_PCA = 5 # Reduce 13 labels to 5 PCA components
70
+
71
+ # Supp V2 expanded columns (pre-computed in V10 CSV)
72
+ SUPP_ROW_COLS = [
73
+ 'supp_school_specific_program_references', 'supp_school_specific_faculty_mentions',
74
+ 'supp_school_specific_campus_features', 'supp_prompt_specific_alignment',
75
+ 'supp_personal_connection_to_school', 'supp_intellectual_engagement_depth',
76
+ 'supp_extracurricular_alignment', 'supp_values_alignment_with_school',
77
+ 'supp_specific_future_contribution', 'supp_unique_personal_context', 'supp_composite',
78
+ ]
79
+ SUPP_STUDENT_AVG_COLS = [
80
+ 'supp_avg_school_specific_program_references', 'supp_avg_school_specific_faculty_mentions',
81
+ 'supp_avg_school_specific_campus_features', 'supp_avg_prompt_specific_alignment',
82
+ 'supp_avg_personal_connection_to_school', 'supp_avg_intellectual_engagement_depth',
83
+ 'supp_avg_extracurricular_alignment', 'supp_avg_values_alignment_with_school',
84
+ 'supp_avg_specific_future_contribution', 'supp_avg_unique_personal_context',
85
+ ]
86
+ SUPP_STUDENT_MAX_COLS = [
87
+ 'supp_max_school_specific_program_references', 'supp_max_school_specific_faculty_mentions',
88
+ 'supp_max_school_specific_campus_features', 'supp_max_prompt_specific_alignment',
89
+ 'supp_max_personal_connection_to_school', 'supp_max_intellectual_engagement_depth',
90
+ 'supp_max_extracurricular_alignment', 'supp_max_values_alignment_with_school',
91
+ 'supp_max_specific_future_contribution', 'supp_max_unique_personal_context',
92
+ ]
93
+ SUPP_STUDENT_AGG_COLS = ['supp_student_avg_composite', 'supp_student_max_composite', 'supp_student_std_composite', 'supp_n_scored']
94
+ SUPP_BINARY_COLS = ['supp_has_campus_feature', 'supp_has_faculty_mention', 'supp_has_future_contribution',
95
+ 'supp_has_personal_connection', 'supp_has_program_ref', 'supp_has_strong_supp', 'supp_high_specificity']
96
+ SUPP_ALL_COLS = SUPP_ROW_COLS + SUPP_STUDENT_AVG_COLS + SUPP_STUDENT_MAX_COLS + SUPP_STUDENT_AGG_COLS + SUPP_BINARY_COLS
97
+
98
def safe_num(v, default=np.nan):
    """Coerce *v* to ``float``, mapping the ``-1`` missing-value sentinel to NaN.

    The upstream feature files use ``-1`` (numeric or string) as a sentinel
    for "missing", so any value that parses to ``-1`` becomes ``np.nan``.

    Parameters
    ----------
    v : int | float | str | Any
        Raw value pulled from a JSON/CSV feature record.
    default : Any, optional
        Returned when *v* is neither numeric nor a parseable numeric string
        (defaults to ``np.nan``).

    Returns
    -------
    float | Any
        ``float(v)`` on success, ``np.nan`` for the -1 sentinel, otherwise
        *default*.
    """
    if isinstance(v, (int, float)):
        val = float(v)
        return np.nan if val == -1 else val
    if isinstance(v, str):
        # Narrow catch: only a parse failure should fall back to `default`.
        # (The original bare `except:` also swallowed KeyboardInterrupt etc.)
        try:
            val = float(v)
        except ValueError:
            return default
        return np.nan if val == -1 else val
    return default
109
+
110
+ # ============================================================
111
+ # 1. LOAD DATA
112
+ # ============================================================
113
+ print("=" * 70)
114
+ print(f" V38.2-PRO-V11: PRUNING + HYPERPARAMETER OPTIMIZATION")
115
+ print(f" EXPERIMENT MODE = {EXPERIMENT_MODE}")
116
+ print("=" * 70)
117
+ mode_desc = {
118
+ 'A': 'V8 + 13 new labels (keep BERT)',
119
+ 'B': 'V8 + 13 new labels - act_bert_pca (replace)',
120
+ 'C': 'V8 baseline (no new labels, control)',
121
+ }
122
+ print(f" Mode description: {mode_desc.get(EXPERIMENT_MODE, 'UNKNOWN')}")
123
+
124
+ # Load main feature matrix (V10 includes PS V5 + expanded Supp + act labels)
125
+ v10_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v10.csv')
126
+ v9_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v9.csv')
127
+ v8_path = os.path.join(DATA_DIR, 'v38_2_integrated_features_v8.csv')
128
+ if os.path.exists(v10_path):
129
+ df_raw = pd.read_csv(v10_path)
130
+ print(f"V10 features loaded (PS V5 + expanded Supp): {df_raw.shape}")
131
+ elif os.path.exists(v9_path):
132
+ df_raw = pd.read_csv(v9_path)
133
+ print(f"V9 features loaded: {df_raw.shape}")
134
+ elif os.path.exists(v8_path):
135
+ df_raw = pd.read_csv(v8_path)
136
+ print(f"V8 features loaded: {df_raw.shape}")
137
+ else:
138
+ raise FileNotFoundError("No feature matrix found!")
139
+
140
+ # Load activity LLM labels
141
+ act_labels_path = os.path.join(DATA_DIR, 'act_labels_v2_results.csv')
142
+ act_labels_df = None
143
+ if EXPERIMENT_MODE in ['A', 'B'] and os.path.exists(act_labels_path):
144
+ act_labels_df = pd.read_csv(act_labels_path)
145
+ print(f"Activity LLM labels loaded: {act_labels_df.shape}")
146
+ print(f" Labels: {[c for c in act_labels_df.columns if c != 'student_id']}")
147
+ elif EXPERIMENT_MODE in ['A', 'B']:
148
+ print(f"WARNING: act_labels_v2_results.csv not found! Falling back to mode C")
149
+ EXPERIMENT_MODE = 'C'
150
+
151
+ # Load LLM features
152
+ llm_features_loaded = {}
153
+ for fname, varname in [
154
+ ('llm_activity_scores.json', 'act_scores'),
155
+ ('llm_supp_quality_all.json', 'supp_scores'),
156
+ ('llm_major_difficulty.json', 'major_diff'),
157
+ ('ps_yale_scores.json', 'ps_yale'),
158
+ ]:
159
+ fpath = os.path.join(DATA_DIR, fname)
160
+ if os.path.exists(fpath):
161
+ with open(fpath) as f:
162
+ llm_features_loaded[varname] = json.load(f)
163
+ print(f" Loaded {fname}: {len(llm_features_loaded[varname])} entries")
164
+ else:
165
+ llm_features_loaded[varname] = {}
166
+
167
+ # Load raw data for ED2 round info
168
+ import re
169
+ RAW_CSV = os.path.join(DATA_DIR, 'students_with_essays_merged_clean.csv')
170
+ round_lookup = {}
171
+ if os.path.exists(RAW_CSV):
172
+ print(f"\n Loading raw CSV for ED2 round info...")
173
+ try:
174
+ raw_chunks = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'],
175
+ dtype=str, chunksize=500)
176
+ for chunk in raw_chunks:
177
+ for _, row in chunk.iterrows():
178
+ sid = str(row.get('student_id', '')).replace('.0', '')
179
+ summary = str(row.get('school_results_summary', ''))
180
+ entries = re.split(r'(?=\d+\.)', summary)
181
+ for entry in entries:
182
+ m = re.search(r'(Early Decision II|Early Decision|Early Action II|Early Action|Restrictive Early Action|Regular Decision)', entry)
183
+ if m:
184
+ round_type = m.group(1)
185
+ school_m = re.search(r'\d+\.\s*(.+?)(?:\s*[-–]\s*|\s*\()', entry)
186
+ if school_m:
187
+ school_name = school_m.group(1).strip()
188
+ key = f"{sid}_{school_name}"
189
+ round_lookup[key] = round_type
190
+ print(f" Round lookup built: {len(round_lookup)} entries")
191
+ except Exception as e:
192
+ print(f" Warning: Could not load raw CSV: {e}")
193
+
194
+ # ============================================================
195
+ # 2. MERGE ACTIVITY LLM LABELS INTO MAIN DATAFRAME
196
+ # ============================================================
197
+ if act_labels_df is not None and EXPERIMENT_MODE in ['A', 'B']:
198
+ # Merge on student_id
199
+ n_before = len(df_raw)
200
+ df_raw = df_raw.merge(act_labels_df, on='student_id', how='left')
201
+ assert len(df_raw) == n_before, f"Merge changed row count! {n_before} -> {len(df_raw)}"
202
+
203
+ n_with_labels = df_raw[ACT_LABEL_COLS[0]].notna().sum()
204
+ n_without = df_raw[ACT_LABEL_COLS[0]].isna().sum()
205
+ print(f"\n Activity labels merged: {n_with_labels} rows with labels, {n_without} without ({n_without/len(df_raw)*100:.1f}% NaN)")
206
+
207
+ # ============================================================
208
+ # 3. DATA CLEANING & QUALITY FIXES (same as V8)
209
+ # ============================================================
210
+ print(f"\n{'='*70}")
211
+ print(f" DATA QUALITY FIXES (V8 inherited)")
212
+ print(f"{'='*70}")
213
+
214
+ # Filter years
215
+ df = df_raw[~df_raw['year'].isin([2018, 2019])].copy().reset_index(drop=True)
216
+ print(f"After filtering 2018-2019: {df.shape}")
217
+
218
+ # FIX #1: SAT=0 -> NaN
219
+ sat_zero = (df['sat'] == 0).sum()
220
+ df['has_sat'] = (df['sat'] > 0).astype(int)
221
+ df.loc[df['sat'] == 0, 'sat'] = np.nan
222
+ print(f" FIX #1: SAT=0 -> NaN: {sat_zero} rows")
223
+
224
+ # FIX #2: TOEFL=0 -> NaN
225
+ toefl_zero = (df['toefl'] == 0).sum()
226
+ df['has_toefl'] = (df['toefl'] > 0).astype(int)
227
+ df.loc[df['toefl'] == 0, 'toefl'] = np.nan
228
+ print(f" FIX #2: TOEFL=0 -> NaN: {toefl_zero} rows")
229
+
230
+ # FIX #3: GPA=0 -> NaN
231
+ gpa_zero = (df['gpa'] == 0).sum()
232
+ df.loc[df['gpa'] == 0, 'gpa'] = np.nan
233
+ if 'has_gpa' not in df.columns:
234
+ df['has_gpa'] = df['gpa'].notna().astype(int)
235
+ print(f" FIX #3: GPA=0 -> NaN: {gpa_zero} rows")
236
+
237
+ # FIX #4: -1 -> NaN
238
+ sentinel_cols = ['taste_yearly_admits_log']
239
+ for col in ['hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_overall_hist_rate']:
240
+ if col in df.columns:
241
+ sentinel_cols.append(col)
242
+ for col in sentinel_cols:
243
+ if col in df.columns:
244
+ n_neg1 = (df[col] == -1).sum()
245
+ df.loc[df[col] == -1, col] = np.nan
246
+ if n_neg1 > 0:
247
+ print(f" FIX #4: {col}: -1 -> NaN: {n_neg1} rows")
248
+
249
+ # FIX #5: ps_bert -> NaN for has_ps=0
250
+ ps_bert_cols = [c for c in df.columns if c.startswith('ps_bert_pca_')]
251
+ no_ps_mask = df['has_ps'] == 0
252
+ for col in ps_bert_cols:
253
+ df.loc[no_ps_mask, col] = np.nan
254
+ print(f" FIX #5: ps_bert -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
255
+
256
+ # FIX #6: ps2_* -> NaN for has_ps=0
257
+ ps2_score_cols = [c for c in df.columns if c.startswith('ps2_') and c != 'ps2_essay_type']
258
+ for col in ps2_score_cols:
259
+ df.loc[no_ps_mask, col] = np.nan
260
+ print(f" FIX #6: ps2_* -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
261
+
262
+ # FIX #6b: Remove ps2_is_cliche_topic
263
+ if 'ps2_is_cliche_topic' in df.columns:
264
+ df.drop(columns=['ps2_is_cliche_topic'], inplace=True)
265
+
266
+ # FIX #16b (V9+PS_V5): ps5_* and ps_* V5 features -> NaN for has_ps=0
267
+ ps5_cols = [c for c in df.columns if c.startswith('ps5_') or c in [
268
+ 'ps_word_count_v5', 'ps_flesch_reading_ease', 'ps_flesch_kincaid_grade',
269
+ 'ps_gunning_fog', 'ps_coleman_liau', 'ps_lexical_diversity',
270
+ 'ps_sentence_count', 'ps_avg_sentence_length', 'ps_sentence_length_std',
271
+ 'ps_max_sentence_length', 'ps_min_sentence_length',
272
+ 'ps_sentiment_compound', 'ps_sentiment_positive', 'ps_sentiment_negative', 'ps_sentiment_neutral',
273
+ 'ps_paragraph_count', 'ps_i_count', 'ps_i_ratio', 'ps_we_count', 'ps_my_count',
274
+ 'ps_question_count', 'ps_exclamation_count', 'ps_has_dialogue', 'ps_quote_count',
275
+ 'ps_avg_word_length', 'ps_long_word_ratio', 'ps_transition_count', 'ps_power_word_count']]
276
+ for col in ps5_cols:
277
+ if col in df.columns:
278
+ df.loc[no_ps_mask, col] = np.nan
279
+ print(f" FIX #16b (PS_V5): {len(ps5_cols)} ps5/ps_v5 features -> NaN for has_ps=0: {no_ps_mask.sum()} rows")
280
+
281
+ # Ablation: Remove ps_bert if flag set
282
+ if ABLATE_PS_BERT and ps_bert_cols:
283
+ df.drop(columns=ps_bert_cols, inplace=True)
284
+ print(f" ABLATION: Removed {len(ps_bert_cols)} ps_bert_pca columns")
285
+
286
+ # FIX #8: ps_word_count -> NaN for has_ps=0
287
+ if 'ps_word_count' in df.columns:
288
+ df.loc[no_ps_mask, 'ps_word_count'] = np.nan
289
+
290
+ # FIX #9: ps_flag_* -> NaN for has_ps=0
291
+ ps_flag_cols = [c for c in df.columns if c.startswith('ps_flag_')]
292
+ for col in ps_flag_cols:
293
+ df.loc[no_ps_mask, col] = np.nan
294
+
295
+ # FIX #10: honors_* -> NaN for honors_count=0
296
+ no_honors_mask = df['honors_count'] == 0
297
+ honors_numeric_cols = [c for c in ['honors_max_score', 'honors_avg_score', 'honors_min_score',
298
+ 'honors_total_score', 'honors_quality_ratio',
299
+ 'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
300
+ 'honors_has_national'] if c in df.columns]
301
+ for col in honors_numeric_cols:
302
+ df.loc[no_honors_mask, col] = np.nan
303
+ df['has_honors'] = (df['honors_count'] > 0).astype(int)
304
+
305
+ # FIX #11: act_bert_pca_* -> NaN for act_total_count=0
306
+ no_act_mask = df['act_total_count'] == 0
307
+ act_bert_cols = [c for c in df.columns if c.startswith('act_bert_pca_')]
308
+
309
+ # V9 MODE B: Remove act_bert_pca entirely
310
+ if EXPERIMENT_MODE == 'B' and act_bert_cols:
311
+ df.drop(columns=act_bert_cols, inplace=True)
312
+ print(f" V9 MODE B: REMOVED {len(act_bert_cols)} act_bert_pca columns (replaced by LLM labels)")
313
+ act_bert_cols = []
314
+ else:
315
+ for col in act_bert_cols:
316
+ df.loc[no_act_mask, col] = np.nan
317
+ print(f" FIX #11: act_bert_pca -> NaN for act_total_count=0: {no_act_mask.sum()} rows")
318
+
319
+ # FIX #12: act_slot_pca_* -> NaN for act_total_count=0
320
+ act_slot_cols = [c for c in df.columns if c.startswith('act_slot_pca_')]
321
+ for col in act_slot_cols:
322
+ df.loc[no_act_mask, col] = np.nan
323
+
324
+ # FIX #13-14: cuilu -> NaN when cuilu_hs_total=0
325
+ no_cuilu_mask = df['cuilu_hs_total'] == 0
326
+ for col in ['cuilu_hs_to_univ', 'cuilu_hs_to_univ_pct', 'cuilu_hs_top10_rate',
327
+ 'cuilu_hs_top20_rate', 'cuilu_hs_top10_count', 'cuilu_hs_top20_count']:
328
+ if col in df.columns:
329
+ df.loc[no_cuilu_mask, col] = np.nan
330
+
331
+ # FIX #15: Remove taste_yearly_admits_log
332
+ if 'taste_yearly_admits_log' in df.columns:
333
+ df.drop(columns=['taste_yearly_admits_log'], inplace=True)
334
+
335
+ df['has_act'] = (df['act_total_count'] > 0).astype(int)
336
+ df['has_cuilu'] = (df['cuilu_hs_total'] > 0).astype(int)
337
+
338
+ # V9 NEW #16: Set activity labels to NaN for act_total_count=0
339
+ if EXPERIMENT_MODE in ['A', 'B']:
340
+ act_label_cols_in_df = [c for c in ACT_LABEL_COLS if c in df.columns]
341
+ for col in act_label_cols_in_df:
342
+ df.loc[no_act_mask, col] = np.nan
343
+ n_label_nan = no_act_mask.sum()
344
+ print(f" V9 NEW #16: Activity labels -> NaN for act_total_count=0: {n_label_nan} rows")
345
+
346
+ # Also create has_act_labels flag
347
+ df['has_act_labels'] = df[act_label_cols_in_df[0]].notna().astype(int)
348
+ n_with = df['has_act_labels'].sum()
349
+ print(f" has_act_labels=1: {n_with}, =0: {len(df)-n_with}")
350
+
351
+ # Create aggregate features from labels
352
+ df['act_label_mean'] = df[act_label_cols_in_df].mean(axis=1)
353
+ df['act_label_max'] = df[act_label_cols_in_df].max(axis=1)
354
+ df['act_label_min'] = df[act_label_cols_in_df].min(axis=1)
355
+ df['act_label_std'] = df[act_label_cols_in_df].std(axis=1)
356
+ df['act_label_range'] = df['act_label_max'] - df['act_label_min']
357
+ print(f" Created aggregate features: act_label_mean/max/min/std/range")
358
+
359
+ print(f"\n All V8 fixes applied. Shape: {df.shape}")
360
+
361
+ # Portfolio size transform
362
+ df['portfolio_size_raw'] = df['portfolio_size'].copy()
363
+ df['portfolio_size'] = np.log1p(df['portfolio_size'].clip(upper=20))
364
+ df['portfolio_size_bin'] = pd.cut(df['portfolio_size_raw'],
365
+ bins=[0, 5, 10, 15, 20, 100],
366
+ labels=[0, 1, 2, 3, 4]).astype(int)
367
+
368
+ # ED2 split
369
def get_detailed_round(row):
    """Resolve the detailed application round for one (student, school) row.

    First consults the module-level ``round_lookup`` table under the key
    ``"<student_id>_<school>"``; if the raw round string matches a known
    phrase, the corresponding short label is returned.  Otherwise falls back
    to the row's own ``round_cat``, mapping the legacy 'ED' value to 'ED1'.
    """
    student = str(row.get('student_id', '')).replace('.0', '')
    school = str(row.get('school', ''))
    raw = round_lookup.get(f"{student}_{school}", '')

    # Order matters: more specific phrases must be tested before the phrases
    # they contain ('Early Decision II' before 'Early Decision',
    # 'Restrictive Early Action' before 'Early Action').
    phrase_to_label = (
        ('Early Decision II', 'ED2'),
        ('Early Decision', 'ED1'),
        ('Restrictive Early Action', 'REA'),
        ('Early Action II', 'EA'),
        ('Early Action', 'EA'),
        ('Regular Decision', 'RD'),
    )
    for phrase, label in phrase_to_label:
        if phrase in raw:
            return label

    fallback = str(row.get('round_cat', 'RD'))
    return 'ED1' if fallback == 'ED' else fallback
382
+
383
+ df['round_cat_v2'] = df.apply(get_detailed_round, axis=1)
384
+ df['is_ed1'] = (df['round_cat_v2'] == 'ED1').astype(int)
385
+ df['is_ed2'] = (df['round_cat_v2'] == 'ED2').astype(int)
386
+ df['is_rea'] = (df['round_cat_v2'] == 'REA').astype(int)
387
+ df['is_early'] = df['round_cat_v2'].isin(['ED1', 'ED2', 'EA', 'REA']).astype(int)
388
+ df['round_cat'] = df['round_cat_v2']
389
+
390
+ # ============================================================
391
+ # 3. PARSE LLM FEATURES (same as V8)
392
+ # ============================================================
393
+ act_scores = {}
394
+ raw = llm_features_loaded.get('act_scores', {})
395
+ if isinstance(raw, list):
396
+ for item in raw:
397
+ if isinstance(item, dict) and item.get('success', False):
398
+ sid_raw = str(item.get('student_id', ''))
399
+ act_scores[sid_raw] = item
400
+ parts = sid_raw.split('_')
401
+ for p in parts:
402
+ clean = p.replace('.0', '')
403
+ if clean.isdigit():
404
+ act_scores[clean] = item
405
+ elif isinstance(raw, dict):
406
+ for sid, scores in raw.items():
407
+ if isinstance(scores, dict):
408
+ act_scores[sid] = scores
409
+
410
+ supp_scores = {}
411
+ raw = llm_features_loaded.get('supp_scores', {})
412
+ if isinstance(raw, list):
413
+ for item in raw:
414
+ if isinstance(item, dict) and item.get('success', False):
415
+ sid = str(item.get('student_id', '')).replace('.0', '')
416
+ school = str(item.get('school', ''))
417
+ key = f"{sid}_{school}"
418
+ oq = item.get('overall_quality', 0)
419
+ if isinstance(oq, (int, float)) and oq <= 1:
420
+ continue
421
+ supp_scores[key] = item
422
+ elif isinstance(raw, dict):
423
+ for key, scores in raw.items():
424
+ if isinstance(scores, dict):
425
+ oq = scores.get('overall_quality', 0)
426
+ if isinstance(oq, (int, float)) and oq <= 1:
427
+ continue
428
+ supp_scores[key] = scores
429
+
430
+ major_diff = llm_features_loaded.get('major_diff', {})
431
+ if isinstance(major_diff, list):
432
+ major_diff = {}
433
+
434
+ ps_yale = {}
435
+ raw = llm_features_loaded.get('ps_yale', {})
436
+ if isinstance(raw, list):
437
+ for item in raw:
438
+ if isinstance(item, dict):
439
+ sid = str(item.get('student_id', '')).replace('.0', '')
440
+ ps_yale[sid] = item
441
+ elif isinstance(raw, dict):
442
+ ps_yale = raw
443
+
444
+ print(f"\nLLM features: Activity={len(act_scores)}, Supp={len(supp_scores)}, MajorDiff={len(major_diff)}, PS={len(ps_yale)}")
445
+
446
+ ACT_DIMS = ['max_power_index', 'avg_power_index', 'n_high_power',
447
+ 'n_founder', 'n_president', 'max_scope',
448
+ 'has_publication', 'has_patent', 'has_summer_program',
449
+ 'summer_program_tier', 'has_olympiad', 'olympiad_level',
450
+ 'activity_coherence', 'spike_strength']
451
+
452
+ SUPP_DIMS = ['overall_quality', 'specificity_score', 'enthusiasm_score',
453
+ 'has_imagination_scene', 'mentions_specific_course',
454
+ 'mentions_specific_professor', 'mentions_specific_program',
455
+ 'mentions_specific_facility', 'coherence_with_major', 'has_red_flag']
456
+
457
+ sample_ps = next(iter(ps_yale.values()), {}) if ps_yale else {}
458
+ PS_DIMS = [k for k in sample_ps.keys() if k not in ['student_id', 'success', 'error', 'note', 'essay_type']
459
+ and not k.startswith('is_')]
460
+ if not PS_DIMS:
461
+ PS_DIMS = ['show_not_tell', 'reflection_depth', 'authentic_voice',
462
+ 'coherence_focus', 'overall_effectiveness']
463
+
464
+ # ============================================================
465
+ # 4. DEFINE FEATURE GROUPS
466
+ # ============================================================
467
+ STUDENT_LEVEL_NUMERIC = [
468
+ 'toefl', 'sat', 'gpa',
469
+ 'act_total_count', 'act_type_diversity',
470
+ *[f'act_slot_pca_{i}' for i in range(20)],
471
+ 'honors_max_score', 'honors_avg_score', 'honors_min_score',
472
+ 'honors_count', 'honors_total_score',
473
+ 'honors_has_top_tier', 'honors_tier1_count', 'honors_tier2_count',
474
+ 'honors_has_national', 'honors_quality_ratio',
475
+ 'cuilu_hs_top10_rate', 'cuilu_hs_top20_rate',
476
+ 'cuilu_hs_top10_count', 'cuilu_hs_top20_count',
477
+ 'cuilu_hs_total',
478
+ 'cuilu_feeder_rank', 'cuilu_hs_type_rate', 'cuilu_region_rate',
479
+ 'hs_to_univ_hist_rate', 'hs_to_univ_hist_rate_smoothed', 'hs_to_univ_hist_admits',
480
+ 'hs_overall_hist_rate',
481
+ 'summer_max_geili', 'summer_has_elite', 'summer_count',
482
+ 'summer_program_count', 'summer_difficulty_max',
483
+ 'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice', 'ps2_overall', 'ps2_mean',
484
+ 'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
485
+ 'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
486
+ 'has_honors', 'has_act', 'has_cuilu',
487
+ ]
488
+
489
+ # Conditionally include act_bert_pca (Mode A keeps, Mode B removes)
490
+ if EXPERIMENT_MODE != 'B':
491
+ STUDENT_LEVEL_NUMERIC.extend([f'act_bert_pca_{i}' for i in range(16)])
492
+
493
+ # Conditionally include ps_bert_pca
494
+ if not ABLATE_PS_BERT:
495
+ STUDENT_LEVEL_NUMERIC.extend([f'ps_bert_pca_{i}' for i in range(16)])
496
+
497
+ # V9: Include activity LLM labels
498
+ if EXPERIMENT_MODE in ['A', 'B']:
499
+ STUDENT_LEVEL_NUMERIC.extend(ACT_LABEL_COLS)
500
+ STUDENT_LEVEL_NUMERIC.extend(['act_label_mean', 'act_label_max', 'act_label_min',
501
+ 'act_label_std', 'act_label_range', 'has_act_labels'])
502
+
503
+ # V9+PS_V5: Include PS V5 hybrid features (LLM extraction + programmatic)
504
+ ps5_feature_cols = [c for c in df.columns if c.startswith('ps5_') or c in [
505
+ 'ps_word_count_v5', 'ps_flesch_reading_ease', 'ps_flesch_kincaid_grade',
506
+ 'ps_gunning_fog', 'ps_coleman_liau', 'ps_lexical_diversity',
507
+ 'ps_sentence_count', 'ps_avg_sentence_length', 'ps_sentence_length_std',
508
+ 'ps_max_sentence_length', 'ps_min_sentence_length',
509
+ 'ps_sentiment_compound', 'ps_sentiment_positive', 'ps_sentiment_negative', 'ps_sentiment_neutral',
510
+ 'ps_paragraph_count', 'ps_i_count', 'ps_i_ratio', 'ps_we_count', 'ps_my_count',
511
+ 'ps_question_count', 'ps_exclamation_count', 'ps_has_dialogue', 'ps_quote_count',
512
+ 'ps_avg_word_length', 'ps_long_word_ratio', 'ps_transition_count', 'ps_power_word_count']]
513
+ STUDENT_LEVEL_NUMERIC.extend(ps5_feature_cols)
514
+ print(f" PS V5 hybrid features added: {len(ps5_feature_cols)} columns")
515
+
516
+ # PS-related features for special school_mean handling
517
+ PS_RELATED_FEATURES = set([
518
+ *[f'ps_bert_pca_{i}' for i in range(16)],
519
+ 'ps2_character_revelation', 'ps2_reflection_depth', 'ps2_craft_voice',
520
+ 'ps2_overall', 'ps2_mean',
521
+ 'ps2_is_ai_written', 'ps2_is_consultant_heavy', 'ps2_is_resume_essay',
522
+ 'ps2_is_trauma_porn', 'ps2_has_factual_concerns',
523
+ 'ps_word_count',
524
+ ])
525
+ # V9+PS_V5: Add PS V5 features to PS_RELATED_FEATURES for proper residualization
526
+ PS_RELATED_FEATURES.update(set(ps5_feature_cols))
527
+
528
+ # V10: Include expanded Supp features (pre-computed in CSV)
529
+ supp_in_data = [c for c in SUPP_ALL_COLS if c in df.columns]
530
+ STUDENT_LEVEL_NUMERIC.extend(supp_in_data)
531
+ print(f" Supp V2 expanded features added: {len(supp_in_data)} columns")
532
+ # Row-level supp features are school-level (student x school)
533
+ # Student-level supp aggregates capture overall supp writing ability
534
+
535
+ # V9: Activity label features need special handling (only has_act_labels=1 for school_mean)
536
+ ACT_LABEL_FEATURES = set(ACT_LABEL_COLS + ['act_label_mean', 'act_label_max', 'act_label_min',
537
+ 'act_label_std', 'act_label_range'])
538
+
539
+ # Add act_type_count columns
540
+ act_type_cols_in_data = [c for c in df.columns if c.startswith('act_type_count_')]
541
+ STUDENT_LEVEL_NUMERIC.extend(act_type_cols_in_data)
542
+
543
+ # Filter to existing
544
+ STUDENT_LEVEL_NUMERIC = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
545
+ print(f"\n Student-level numeric features: {len(STUDENT_LEVEL_NUMERIC)}")
546
+
547
+ KEY_STUDENT_FEATURES = [
548
+ 'toefl', 'sat', 'gpa',
549
+ 'honors_max_score', 'honors_avg_score', 'honors_count',
550
+ 'honors_quality_ratio',
551
+ 'act_type_diversity', 'act_total_count',
552
+ 'hs_to_univ_hist_rate_smoothed',
553
+ 'summer_max_geili',
554
+ 'ps2_overall', 'ps2_character_revelation', 'ps2_craft_voice',
555
+ ]
556
+
557
+ # V9: Add top activity labels to key features for interactions
558
+ if EXPERIMENT_MODE in ['A', 'B']:
559
+ KEY_STUDENT_FEATURES.extend(['act_label_mean', 'social_impact_depth',
560
+ 'tone_calibration', 'academic_depth'])
561
+
562
+ LLM_INTERACTION_FEATURES = [
563
+ 'llm_act_mean', 'llm_act_max', 'llm_act_avg_power_index',
564
+ 'supp_mean', 'supp_max', 'supp_composite', 'supp_student_avg_composite',
565
+ 'ps_mean', 'major_difficulty',
566
+ 'ps2_mean', 'ps2_overall',
567
+ ]
568
+
569
+ # ============================================================
570
+ # 5. BUILD FEATURES
571
+ # ============================================================
572
+ def build_features_base(df):
573
+ df = df.copy()
574
+
575
+ df['is_partial_year'] = (df['year'] == 2025).astype(int)
576
+ df['year_cat'] = df['year'].astype(str)
577
+ df['sid_str'] = df['student_id'].astype(str).str.replace('.0', '', regex=False)
578
+
579
+ # LLM Activity features
580
+ for dim in ACT_DIMS:
581
+ col_name = f'llm_act_{dim}'
582
+ df[col_name] = df['sid_str'].map(
583
+ lambda s, d=dim: safe_num(act_scores.get(s, {}).get(d, np.nan)))
584
+
585
+ # LLM Supp features - V10: already pre-computed in CSV, just compute aggregates
586
+ # supp_composite and row-level scores are already in the dataframe
587
+
588
+ # Major difficulty
589
+ def get_major_diff(row):
590
+ key = f"{row['school']}_{row['major_cat']}"
591
+ return safe_num(major_diff.get(key, {}).get('difficulty_score', np.nan))
592
+ df['major_difficulty'] = df.apply(get_major_diff, axis=1)
593
+
594
+ # PS Yale scores
595
+ for dim in PS_DIMS:
596
+ col_name = f'ps_{dim}'
597
+ df[col_name] = df['sid_str'].map(
598
+ lambda s, d=dim: safe_num(ps_yale.get(s, {}).get(d, np.nan)))
599
+
600
+ # Aggregates
601
+ llm_act_cols = [f'llm_act_{d}' for d in ACT_DIMS]
602
+ valid_act = df[llm_act_cols]
603
+ df['llm_act_mean'] = valid_act.mean(axis=1)
604
+ df['llm_act_max'] = valid_act.max(axis=1)
605
+ df['llm_act_n_valid'] = valid_act.notna().sum(axis=1)
606
+
607
+ # V10: Use pre-computed supp_composite as supp_mean, and compute supp_max from row-level scores
608
+ supp_row_in_df = [c for c in SUPP_ROW_COLS if c in df.columns and c != 'supp_composite']
609
+ if supp_row_in_df:
610
+ valid_supp = df[supp_row_in_df]
611
+ df['supp_mean'] = valid_supp.mean(axis=1)
612
+ df['supp_max'] = valid_supp.max(axis=1)
613
+ elif 'supp_composite' in df.columns:
614
+ df['supp_mean'] = df['supp_composite']
615
+ df['supp_max'] = df['supp_composite']
616
+ else:
617
+ df['supp_mean'] = np.nan
618
+ df['supp_max'] = np.nan
619
+
620
+ ps_cols = [f'ps_{d}' for d in PS_DIMS]
621
+ valid_ps = df[ps_cols]
622
+ df['ps_mean'] = valid_ps.mean(axis=1)
623
+
624
+ # Basic interactions
625
+ df['toefl_x_sat'] = df['toefl'] * df['sat'] / 10000.0
626
+ df['gpa_x_toefl'] = df['gpa'] * df['toefl'] / 100.0
627
+ df['llm_act_x_supp'] = df['llm_act_mean'] * df['supp_mean']
628
+
629
+ if 'honors_avg_score' in df.columns:
630
+ df['honors_x_sat'] = df['honors_avg_score'] * df['sat'] / 1600
631
+ df['honors_x_toefl'] = df['honors_avg_score'] * df['toefl'] / 120
632
+
633
+ if 'cuilu_hs_top10_rate' in df.columns and 'taste_score_sensitivity' in df.columns:
634
+ df['cuilu_x_taste'] = df['cuilu_hs_top10_rate'] * df['taste_score_sensitivity']
635
+
636
+ # V9: Activity label interactions
637
+ if EXPERIMENT_MODE in ['A', 'B'] and 'act_label_mean' in df.columns:
638
+ df['act_label_x_supp'] = df['act_label_mean'] * df['supp_mean']
639
+ df['act_label_x_llm_act'] = df['act_label_mean'] * df['llm_act_mean']
640
+ if 'ps2_mean' in df.columns:
641
+ df['act_label_x_ps2'] = df['act_label_mean'] * df['ps2_mean']
642
+ print(f" V9: Created activity label interaction features")
643
+
644
+ # V11 NEW #24: Domain-specific interaction features
645
+ # GPA × summer elite: academic depth in elite programs
646
+ if 'gpa' in df.columns and 'summer_has_elite' in df.columns:
647
+ df['gpa_x_summer_elite'] = df['gpa'] * df['summer_has_elite']
648
+ # Honors × activity label: well-roundedness signal
649
+ if 'honors_avg_score' in df.columns and 'act_label_mean' in df.columns:
650
+ df['honors_x_act_label'] = df['honors_avg_score'] * df['act_label_mean']
651
+ # Portfolio size × supp composite: application completeness × quality
652
+ if 'portfolio_size' in df.columns and 'supp_student_avg_composite' in df.columns:
653
+ df['portfolio_x_supp_avg'] = df['portfolio_size'] * df['supp_student_avg_composite']
654
+ print(f" V11: Created 3 new domain-specific interaction features")
655
+
656
+ # Categoricals
657
+ cat_cols = ['school', 'round_cat', 'major_cat', 'hs_cat', 'year_cat', 'hs_name', 'province']
658
+ cat_cols = [c for c in cat_cols if c in df.columns]
659
+
660
+ if 'round_cat' in df.columns:
661
+ df['school_round'] = df['school'].astype(str) + '_' + df['round_cat'].astype(str)
662
+ cat_cols.append('school_round')
663
+ df['school_major'] = df['school'].astype(str) + '_' + df['major_cat'].astype(str)
664
+ cat_cols.append('school_major')
665
+ if 'hs_cat' in df.columns:
666
+ df['school_hstype'] = df['school'].astype(str) + '_' + df['hs_cat'].astype(str)
667
+ cat_cols.append('school_hstype')
668
+
669
+ for c in cat_cols:
670
+ df[c] = df[c].fillna('_MISSING_').astype(str)
671
+ le = LabelEncoder()
672
+ df[c] = le.fit_transform(df[c]).astype(int)
673
+
674
+ return df, cat_cols
675
+
676
+
677
def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
    """Add school-level, residualized, percentile and interaction features.

    All statistics (school admit rates, ED boosts, per-school means and
    percentile distributions, the activity-label PCA) are fitted ONLY on the
    rows selected by ``train_mask`` and then applied to the whole frame, so
    the function can be re-run per CV fold without target leakage.

    Args:
        df: base feature frame (output of build_features_base); not mutated,
            a copy is taken first.
        train_mask: boolean indexer selecting the training rows used to fit
            every statistic below.
        cat_cols: label-encoded categorical column names (passed through).
        selected_features: optional set of feature names from a prior
            selection stage; when given, the final feature list is pruned to
            this set plus a hard-coded ``must_keep`` whitelist.

    Returns:
        (df, feature_cols, cat_cols, cat_indices) where ``cat_indices`` are
        positions of categorical columns inside ``feature_cols``.
    """
    df = df.copy()

    train_df = df[train_mask]
    global_rate = train_df[TARGET].mean()

    # Per-school application counts and raw admit rate on training rows only.
    school_stats = train_df.groupby('school').agg(
        school_raw_rate=(TARGET, 'mean'),
        school_n_apps=(TARGET, 'count'),
        school_n_admits=(TARGET, 'sum'),
    ).reset_index()

    # Bayesian smoothing towards the global rate: schools with few
    # applications are pulled strongly to global_rate (pseudo-count of 30).
    SMOOTH_STRENGTH = 30
    school_stats['school_base_rate'] = (
        (school_stats['school_raw_rate'] * school_stats['school_n_apps'] + global_rate * SMOOTH_STRENGTH) /
        (school_stats['school_n_apps'] + SMOOTH_STRENGTH)
    )

    # NOTE(review): merge rebuilds the row index; downstream boolean masking
    # assumes df_base uses a default RangeIndex — confirm at the call sites.
    df = df.merge(school_stats[['school', 'school_base_rate', 'school_n_apps', 'school_n_admits']],
                  on='school', how='left')
    # Schools unseen in training fall back to the global rate / zero counts.
    df['school_base_rate'] = df['school_base_rate'].fillna(global_rate)
    df['school_n_apps'] = df['school_n_apps'].fillna(0)
    df['school_n_admits'] = df['school_n_admits'].fillna(0)

    # ED boost: per-school admit-rate advantage of ED1 (and ED2) over RD.
    ed1_mask = train_df['is_ed1'] == 1
    rd_mask = train_df['is_early'] == 0
    ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
    rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()

    ed_boost_map = {}
    for school in ed1_school_rates.index:
        if school in rd_school_rates.index:
            # Only schools with both ED1 and RD data get a boost estimate.
            ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
    df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)

    ed2_mask = train_df['is_ed2'] == 1
    ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
    ed2_boost_map = {}
    for school in ed2_school_rates.index:
        if school in rd_school_rates.index:
            ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
    df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)

    # Residualize student features: value minus the per-school training mean,
    # i.e. "how does this applicant compare to this school's typical pool".
    student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
    train_has_ps = train_df[train_df['has_ps'] == 1]

    # V9: Pre-compute has_act_labels=1 subset for activity label features.
    if 'has_act_labels' in train_df.columns:
        train_has_act_labels = train_df[train_df['has_act_labels'] == 1]
    else:
        train_has_act_labels = train_df

    resid_cols = []
    for col in student_feat_available:
        resid_col = f'{col}_resid'

        # The reference mean is computed on the subset of training rows that
        # actually have the underlying signal, so missing-data rows do not
        # drag the school mean towards NaN-imputed values.
        if col in PS_RELATED_FEATURES:
            school_mean_series = train_has_ps.groupby('school')[col].mean()
        elif col in ACT_LABEL_FEATURES:
            # V9: Use has_act_labels=1 subset for activity label features.
            school_mean_series = train_has_act_labels.groupby('school')[col].mean()
        elif col.startswith('honors_') and col != 'honors_count':
            train_has_honors = train_df[train_df['honors_count'] > 0]
            school_mean_series = train_has_honors.groupby('school')[col].mean()
        elif col.startswith('act_bert_pca_') or col.startswith('act_slot_pca_'):
            train_has_act = train_df[train_df['act_total_count'] > 0]
            school_mean_series = train_has_act.groupby('school')[col].mean()
        elif col.startswith('cuilu_hs_to_univ') or col in ['cuilu_hs_top10_rate', 'cuilu_hs_top20_rate', 'cuilu_hs_top10_count', 'cuilu_hs_top20_count']:
            train_has_cuilu = train_df[train_df['cuilu_hs_total'] > 0]
            school_mean_series = train_has_cuilu.groupby('school')[col].mean()
        else:
            school_mean_series = train_df.groupby('school')[col].mean()

        col_school_mean = df['school'].map(school_mean_series)
        df[resid_col] = df[col] - col_school_mean
        resid_cols.append(resid_col)

    # V9 NEW #17: Activity label PCA (reduce multicollinearity).
    act_label_pca_cols = []
    if EXPERIMENT_MODE in ['A', 'B']:
        act_label_cols_in_df = [c for c in ACT_LABEL_COLS if c in df.columns]
        if act_label_cols_in_df:
            # Fit scaler + PCA on complete-case training rows only.
            train_label_data = train_df[act_label_cols_in_df].dropna()
            if len(train_label_data) > N_ACT_LABEL_PCA * 2:
                scaler = StandardScaler()
                pca = PCA(n_components=N_ACT_LABEL_PCA, random_state=42)

                train_scaled = scaler.fit_transform(train_label_data)
                pca.fit(train_scaled)

                # Transform all rows; NaNs are mean-imputed (training means)
                # only for the transform, then components are re-masked to
                # NaN for rows that had no label at all.
                all_label_data = df[act_label_cols_in_df].copy()
                has_any_label = all_label_data.notna().any(axis=1)
                fill_means = train_label_data.mean()
                all_label_filled = all_label_data.fillna(fill_means)
                all_scaled = scaler.transform(all_label_filled)
                pca_result = pca.transform(all_scaled)

                for i in range(N_ACT_LABEL_PCA):
                    col_name = f'act_label_pca_{i}'
                    df[col_name] = pca_result[:, i]
                    # Set to NaN where original labels were all NaN.
                    df.loc[~has_any_label, col_name] = np.nan
                    act_label_pca_cols.append(col_name)

                var_explained = pca.explained_variance_ratio_
                print(f" V9 NEW #17: Activity label PCA: {len(act_label_cols_in_df)} -> {N_ACT_LABEL_PCA} components")
                print(f" Variance explained: {var_explained.sum():.3f} ({', '.join(f'{v:.3f}' for v in var_explained)})")

    # ps2_mean_school_pctile: empirical CDF position of the applicant's
    # ps2_mean within the school's training distribution (has_ps rows only).
    pctile_ps_cols = []
    if 'ps2_mean' in df.columns:
        ps_pctile_col = 'ps2_mean_school_pctile'
        school_ps_distributions = {}
        for school_id in train_has_ps['school'].unique():
            vals = train_has_ps[train_has_ps['school'] == school_id]['ps2_mean'].dropna().values
            if len(vals) > 2:
                # Require >2 observations for a meaningful percentile.
                school_ps_distributions[school_id] = vals

        def compute_ps_pctile(row, sd=school_ps_distributions):
            # sd bound as default arg to freeze the dict at definition time.
            school_id = row['school']
            val = row['ps2_mean']
            if pd.isna(val) or school_id not in sd:
                return np.nan
            return np.mean(sd[school_id] <= val)

        df[ps_pctile_col] = df.apply(compute_ps_pctile, axis=1)
        pctile_ps_cols.append(ps_pctile_col)

    # V9 NEW #18: Activity label school percentile (same ECDF scheme).
    act_label_pctile_cols = []
    if EXPERIMENT_MODE in ['A', 'B'] and 'act_label_mean' in df.columns:
        al_pctile_col = 'act_label_mean_school_pctile'
        school_al_distributions = {}
        for school_id in train_has_act_labels['school'].unique():
            vals = train_has_act_labels[train_has_act_labels['school'] == school_id]['act_label_mean'].dropna().values
            if len(vals) > 2:
                school_al_distributions[school_id] = vals

        def compute_al_pctile(row, sd=school_al_distributions):
            school_id = row['school']
            val = row['act_label_mean']
            if pd.isna(val) or school_id not in sd:
                return np.nan
            return np.mean(sd[school_id] <= val)

        df[al_pctile_col] = df.apply(compute_al_pctile, axis=1)
        act_label_pctile_cols.append(al_pctile_col)
        n_valid = df[al_pctile_col].notna().sum()
        print(f" V9 NEW #18: {al_pctile_col}: {n_valid} valid values")

    # Interactions of student features with the smoothed school admit rate.
    interaction_cols = []
    for col in KEY_STUDENT_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

            resid_col = f'{col}_resid'
            if resid_col in df.columns:
                int_resid_col = f'{col}_resid_x_rate'
                df[int_resid_col] = df[resid_col] * df['school_base_rate']
                interaction_cols.append(int_resid_col)

    for col in LLM_INTERACTION_FEATURES:
        if col in df.columns:
            int_col = f'{col}_x_school_rate'
            df[int_col] = df[col] * df['school_base_rate']
            interaction_cols.append(int_col)

    if 'portfolio_size' in df.columns:
        df['portfolio_x_school_rate'] = df['portfolio_size'] * df['school_base_rate']
        interaction_cols.append('portfolio_x_school_rate')

    # ED flags × school-specific ED boost (non-zero only for ED applicants).
    if 'is_ed1' in df.columns:
        df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
        interaction_cols.append('ed1_x_ed_boost')
    if 'is_ed2' in df.columns:
        df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
        interaction_cols.append('ed2_x_ed2_boost')

    for flag in ['has_sat', 'has_toefl', 'has_gpa']:
        if flag in df.columns:
            int_col = f'{flag}_x_school_rate'
            df[int_col] = df[flag] * df['school_base_rate']
            interaction_cols.append(int_col)

    if 'ps2_mean_school_pctile' in df.columns:
        df['ps2_pctile_x_school_rate'] = df['ps2_mean_school_pctile'] * df['school_base_rate']
        interaction_cols.append('ps2_pctile_x_school_rate')

    # V9 NEW #19: Activity label percentile x school_base_rate.
    if 'act_label_mean_school_pctile' in df.columns:
        df['act_label_pctile_x_school_rate'] = df['act_label_mean_school_pctile'] * df['school_base_rate']
        interaction_cols.append('act_label_pctile_x_school_rate')

    # Student percentile within school for core numeric signals.
    pctile_cols = []
    for col in ['toefl', 'sat', 'gpa', 'honors_max_score', 'llm_act_mean', 'supp_mean']:
        if col not in df.columns:
            continue
        pctile_col = f'{col}_school_pctile'
        school_distributions = {}
        for school_id in train_df['school'].unique():
            vals = train_df[train_df['school'] == school_id][col].dropna().values
            if len(vals) > 2:
                school_distributions[school_id] = vals

        def compute_pctile(row, col=col, sd=school_distributions):
            # col/sd bound as defaults: avoids the late-binding closure trap
            # since this function is redefined every loop iteration.
            school_id = row['school']
            val = row[col]
            if pd.isna(val) or school_id not in sd:
                return np.nan
            return np.mean(sd[school_id] <= val)

        df[pctile_col] = df.apply(compute_pctile, axis=1)
        pctile_cols.append(pctile_col)

    pctile_cols.extend(pctile_ps_cols)
    pctile_cols.extend(act_label_pctile_cols)

    # Student competitiveness: mean of scale-normalized core scores.
    if all(c in df.columns for c in ['toefl', 'sat', 'honors_max_score']):
        components = []
        weights = []
        # NOTE(review): `weights` is collected but never applied — the
        # strength below is an unweighted mean. Confirm whether a weighted
        # mean was intended before changing behavior.
        for col, w, scale in [('toefl', 0.3, 120), ('sat', 0.3, 1600),
                              ('honors_max_score', 0.2, 10), ('llm_act_mean', 0.2, 10)]:
            if col in df.columns:
                components.append(df[col] / scale)
                weights.append(w)
        if components:
            strength_df = pd.DataFrame(components).T
            df['student_strength'] = strength_df.mean(axis=1)
            # Positive when the student looks stronger than the school is
            # selective (selectivity proxied by 1 - base admit rate).
            df['strength_vs_school'] = df['student_strength'] - (1 - df['school_base_rate'])

    # Build final feature list: every numeric column except identifiers.
    num_cols = [c for c in df.columns if df[c].dtype in ['float64', 'int64', 'float32', 'int32']
                and c not in [TARGET, 'student_id', 'year', 'Unnamed: 0']]

    all_feat = list(set(num_cols + cat_cols))
    feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
    for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size_raw']:
        if remove in feature_cols:
            feature_cols.remove(remove)

    # Drop constant columns (no information for tree models).
    to_drop = [c for c in feature_cols if df[c].nunique() <= 1]
    feature_cols = [c for c in feature_cols if c not in to_drop]

    if selected_features is not None:
        # Prune to the pre-selected set, but always keep structural columns
        # (categoricals, school stats, round flags, key percentiles).
        must_keep = set(cat_cols) | {'school_base_rate', 'school_n_apps', 'school_n_admits',
                                     'student_strength', 'strength_vs_school',
                                     'school_ed_boost', 'school_ed2_boost',
                                     'is_ed1', 'is_ed2', 'is_rea', 'is_early',
                                     'ed1_x_ed_boost', 'ed2_x_ed2_boost',
                                     'has_sat', 'has_toefl', 'has_gpa',
                                     'portfolio_size', 'portfolio_size_bin', 'portfolio_x_school_rate',
                                     'ps2_mean_school_pctile', 'ps2_pctile_x_school_rate',
                                     'has_honors', 'has_act', 'has_cuilu'}
        # V9: Always keep activity label features.
        if EXPERIMENT_MODE in ['A', 'B']:
            must_keep.update(set(act_label_pca_cols))
            must_keep.update({'act_label_mean', 'act_label_max', 'act_label_std',
                              'act_label_mean_school_pctile', 'act_label_pctile_x_school_rate',
                              'has_act_labels', 'act_label_x_supp', 'act_label_x_llm_act'})
        feature_cols = [c for c in feature_cols if c in selected_features or c in must_keep]

    # Infinities (e.g. from divisions) become NaN for the GBM libraries.
    for c in feature_cols:
        if df[c].dtype in ['float64', 'float32']:
            df[c] = df[c].replace([np.inf, -np.inf], np.nan)

    cat_indices = [feature_cols.index(c) for c in cat_cols if c in feature_cols]

    # NOTE(review): new_feat_count is computed but never used.
    new_feat_count = len(resid_cols) + len(interaction_cols) + len(pctile_cols) + len(act_label_pca_cols) + 5
    print(f" Features: {len(resid_cols)} resid + {len(interaction_cols)} interact + {len(pctile_cols)} pctile + {len(act_label_pca_cols)} label_pca = total {len(feature_cols)}")

    return df, feature_cols, cat_cols, cat_indices
958
+
959
+
960
# ============================================================
# 6. BUILD BASE FEATURES
# ============================================================
# Fold-independent features only; fold-dependent (leakage-sensitive)
# features are added per split by add_residualized_features().
df_base, cat_cols = build_features_base(df)
print(f"\nBase features built. Shape: {df_base.shape}")

# Target vector and grouping key: folds are split by student so the same
# applicant never appears in both train and validation.
y = df_base[TARGET].values
groups = df_base['student_id'].values
968
+
969
# ============================================================
# 7. STAGE 1: FEATURE IMPORTANCE ESTIMATION
# ============================================================
# Quick 5-fold CatBoost pass whose only purpose is to rank features;
# the top FEATURE_SELECT_TOP_N non-categorical features feed stage 2.
print(f"\n{'='*70}")
print(f" STAGE 1: FEATURE IMPORTANCE ESTIMATION")
print(f"{'='*70}")

stage1_fi = []
gkf_s1 = GroupKFold(n_splits=5)
for fold, (tr_idx, va_idx) in enumerate(gkf_s1.split(df_base, y, groups)):
    # Boolean mask over df_base rows marking this fold's training set,
    # consumed by add_residualized_features to fit stats leak-free.
    train_mask = pd.Series(False, index=df_base.index)
    train_mask.iloc[tr_idx] = True

    df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
        df_base, train_mask, cat_cols)

    X_tr = df_fold[feat_cols_f].iloc[tr_idx]
    X_va = df_fold[feat_cols_f].iloc[va_idx]
    y_tr = y[tr_idx]
    y_va = y[va_idx]

    # CatBoost requires integer (or string) dtype for categorical columns.
    for c in cat_cols_f:
        if c in X_tr.columns:
            X_tr[c] = X_tr[c].astype(int)
            X_va[c] = X_va[c].astype(int)

    # Lighter model than stage 2 (500 iters, depth 6) — speed over accuracy.
    cb = CatBoostClassifier(
        iterations=500, depth=6, learning_rate=0.05,
        l2_leaf_reg=7, random_seed=42, verbose=0,
        cat_features=cat_idx_f, eval_metric='AUC',
        early_stopping_rounds=50)
    pool_tr = Pool(X_tr, y_tr, cat_features=cat_idx_f)
    pool_va = Pool(X_va, y_va, cat_features=cat_idx_f)
    cb.fit(pool_tr, eval_set=pool_va, verbose=0)

    fi = cb.get_feature_importance()
    stage1_fi.append(fi)

    auc = roc_auc_score(y_va, cb.predict_proba(Pool(X_va, cat_features=cat_idx_f))[:, 1])
    print(f" Fold {fold+1}/5: AUC={auc:.4f}, Features={len(feat_cols_f)}")

    if fold == 0:
        # Feature ordering is assumed identical across folds; keep fold 0's
        # names to pair with the averaged importances below.
        all_feature_names = feat_cols_f

    # Free per-fold models/frames eagerly to bound peak memory.
    del cb, pool_tr, pool_va, df_fold; gc.collect()

avg_fi = np.mean(stage1_fi, axis=0)
fi_pairs = sorted(zip(all_feature_names, avg_fi), key=lambda x: -x[1])

# Selection: all categoricals always kept, plus the top-N numeric features.
selected_set = set(cat_cols)
n_added = 0
for fname, imp in fi_pairs:
    if fname not in cat_cols:
        selected_set.add(fname)
        n_added += 1
        if n_added >= FEATURE_SELECT_TOP_N:
            break

print(f"\n Feature selection: {len(all_feature_names)} -> {len(selected_set)} features")
print(f" Top 30 features:")
for i, (fname, imp) in enumerate(fi_pairs[:30]):
    # Markers tag each feature's family for the report (first match wins).
    marker = ""
    if 'act_label' in fname or fname in ACT_LABEL_COLS: marker = " [ACT_LABEL_V9]"
    elif '_resid' in fname: marker = " [R]"
    elif '_x_school_rate' in fname or '_resid_x_rate' in fname or '_x_ed' in fname: marker = " [I]"
    elif '_school_pctile' in fname: marker = " [P]"
    elif 'school_base_rate' in fname: marker = " [S]"
    elif 'ed_boost' in fname: marker = " [ED]"
    elif 'ps2_' in fname: marker = " [PS2]"
    elif 'ps5_' in fname or fname.startswith('ps_') and 'bert' not in fname: marker = " [PS_V5]"
    elif 'supp_' in fname: marker = " [SUPP_V2]"
    elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
    print(f" {i+1:3d}. {fname:<55s} {imp:>8.2f}{marker}")

# Count feature-family representation in the top 50 for the changelog.
v9_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_label' in f or f in ACT_LABEL_COLS)
ps5_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'ps5_' in f or (f.startswith('ps_') and 'bert' not in f and f in ps5_feature_cols))
supp_v2_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'supp_' in f)
print(f" PS V5 features in top 50: {ps5_in_top50}")
print(f" Supp V2 features in top 50: {supp_v2_in_top50}")
bert_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_bert_pca' in f)
print(f"\n V9 activity label features in top 50: {v9_in_top50}")
print(f" act_bert_pca features in top 50: {bert_in_top50}")
1052
+
1053
# ============================================================
# 8. TEMPORAL VALIDATION
# ============================================================
# Forward-in-time check: train on 2020-2023, test on 2024, to estimate
# how the model generalizes to a genuinely new admissions cycle.
print(f"\n{'='*70}")
print(f" TEMPORAL VALIDATION (2020-2023 -> 2024)")
print(f"{'='*70}")

mask_train_temporal = df_base['year'].isin([2020, 2021, 2022, 2023])
mask_test_temporal = df_base['year'] == 2024

temporal_results = {}
if mask_test_temporal.sum() > 0:
    # Fit fold-dependent features on the 2020-2023 rows only.
    df_temporal, feat_cols_t, cat_cols_t, cat_idx_t = add_residualized_features(
        df_base, mask_train_temporal, cat_cols, selected_features=selected_set)

    X_t = df_temporal[feat_cols_t].copy()
    for c in cat_cols_t:
        if c in X_t.columns:
            X_t[c] = X_t[c].astype(int)

    X_tr_t = X_t[mask_train_temporal]
    X_te_t = X_t[mask_test_temporal]
    y_tr_t = y[mask_train_temporal]
    y_te_t = y[mask_test_temporal]

    # LightGBM/XGBoost get NaNs replaced by a sentinel; CatBoost handles
    # missing values natively so it uses the unfilled frames.
    X_tr_t_filled = X_tr_t.fillna(-999)
    X_te_t_filled = X_te_t.fillna(-999)

    print(f" Train: {len(X_tr_t)}, Test: {len(X_te_t)}, Features: {len(feat_cols_t)}")

    # Train the same 3-model ensemble per seed as in stage 2.
    # NOTE(review): early stopping here evaluates on the 2024 test set
    # itself, which makes this estimate slightly optimistic.
    for seed in SEEDS:
        cb_t = CatBoostClassifier(
            iterations=1500, depth=8, learning_rate=0.02,
            l2_leaf_reg=10, random_seed=seed, verbose=0,
            cat_features=cat_idx_t, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=15,
            random_strength=2, bagging_temperature=0.8)
        pool_tr = Pool(X_tr_t, y_tr_t, cat_features=cat_idx_t)
        pool_te = Pool(X_te_t, y_te_t, cat_features=cat_idx_t)
        cb_t.fit(pool_tr, eval_set=pool_te, verbose=0)
        cb_pred = cb_t.predict_proba(Pool(X_te_t, cat_features=cat_idx_t))[:, 1]
        del cb_t; gc.collect()

        lgb_tr = lgb.Dataset(X_tr_t_filled.values, y_tr_t, categorical_feature=cat_idx_t)
        lgb_va = lgb.Dataset(X_te_t_filled.values, y_te_t, categorical_feature=cat_idx_t, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.02, 'num_leaves': 63, 'max_depth': 7,
            'min_child_samples': 30, 'reg_alpha': 0.5, 'reg_lambda': 3.0,
            'feature_fraction': 0.6, 'bagging_fraction': 0.75, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=2000,
                              valid_sets=[lgb_va],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_te_t_filled.values)
        del lgb_model; gc.collect()

        dtrain = xgb.DMatrix(X_tr_t_filled.values, label=y_tr_t, enable_categorical=False)
        dtest = xgb.DMatrix(X_te_t_filled.values, label=y_te_t, enable_categorical=False)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 7, 'learning_rate': 0.02,
            'subsample': 0.75, 'colsample_bytree': 0.6,
            'reg_alpha': 0.5, 'reg_lambda': 3.0,
            'min_child_weight': 7,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=2000,
                              evals=[(dtest, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dtest)
        del xgb_model, dtrain, dtest; gc.collect()

        # Fixed default weights here; stage-2 results use searched weights.
        blend = 0.45 * cb_pred + 0.20 * lgb_pred + 0.35 * xgb_pred
        temporal_results[seed] = {
            'cb': float(roc_auc_score(y_te_t, cb_pred)),
            'lgb': float(roc_auc_score(y_te_t, lgb_pred)),
            'xgb': float(roc_auc_score(y_te_t, xgb_pred)),
            'blend': float(roc_auc_score(y_te_t, blend))
        }
        print(f" Seed {seed}: CB={temporal_results[seed]['cb']:.4f} LGB={temporal_results[seed]['lgb']:.4f} XGB={temporal_results[seed]['xgb']:.4f} Blend={temporal_results[seed]['blend']:.4f}")

    # Average blended AUC across seeds, compared against prior versions.
    avg_temporal = np.mean([v['blend'] for v in temporal_results.values()])
    print(f"\n AVG Temporal Blend: {avg_temporal:.4f}")
    print(f" Delta vs V37.3: {avg_temporal - 0.8410:+.4f}")
    print(f" Delta vs V38.2-PRO-V4: {avg_temporal - 0.8555:+.4f}")
    print(f" Delta vs V38.2-PRO-V8: {avg_temporal - 0.8548:+.4f}")
    print(f" Delta vs V38.2-PRO-V9: {avg_temporal - 0.8594:+.4f}")
    print(f" Delta vs V38.2-PRO-V10: {avg_temporal - 0.8631:+.4f}")

    del df_temporal, X_t; gc.collect()
else:
    # No 2024 rows in this dataset: record a neutral value for the report.
    avg_temporal = 0.0
1147
+
1148
# ============================================================
# 9. STAGE 2: MULTI-SEED GROUPKFOLD
# ============================================================
# Main evaluation: for each seed, produce full out-of-fold (OOF)
# predictions from CatBoost, LightGBM and XGBoost with per-fold
# leak-free feature engineering; OOF vectors are averaged over seeds.
print(f"\n{'='*70}")
print(f" STAGE 2: MULTI-SEED GROUPKFOLD ({len(SEEDS)} seeds x {N_FOLDS} folds)")
print(f"{'='*70}")

all_cb_oof = []
all_lgb_oof = []
all_xgb_oof = []
all_fi = []          # one CatBoost importance vector per seed (last fold)
feature_cols_final = None  # frozen from the first fold of the first seed

for seed_idx, seed in enumerate(SEEDS):
    print(f"\n --- Seed {seed} ({seed_idx+1}/{len(SEEDS)}) ---")
    gkf = GroupKFold(n_splits=N_FOLDS)
    cb_oof = np.zeros(len(df_base))
    lgb_oof = np.zeros(len(df_base))
    xgb_oof = np.zeros(len(df_base))

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(df_base, y, groups)):
        train_mask = pd.Series(False, index=df_base.index)
        train_mask.iloc[tr_idx] = True

        # Re-fit all school stats / residuals / PCA on this fold's train rows.
        df_fold, feat_cols_f, cat_cols_f, cat_idx_f = add_residualized_features(
            df_base, train_mask, cat_cols, selected_features=selected_set)

        if feature_cols_final is None:
            feature_cols_final = feat_cols_f
            print(f" Total features after selection: {len(feat_cols_f)}")

        X_fold = df_fold[feat_cols_f].copy()
        for c in cat_cols_f:
            if c in X_fold.columns:
                X_fold[c] = X_fold[c].astype(int)

        X_tr_df = X_fold.iloc[tr_idx]
        X_va_df = X_fold.iloc[va_idx]
        y_tr = y[tr_idx]
        y_va = y[va_idx]

        # --- CatBoost (handles NaN and categoricals natively) ---
        cb = CatBoostClassifier(
            iterations=2000, depth=8, learning_rate=0.02,
            l2_leaf_reg=10, random_seed=seed, verbose=0,
            cat_features=cat_idx_f, eval_metric='AUC',
            early_stopping_rounds=100, min_data_in_leaf=15,
            random_strength=2, bagging_temperature=0.8)
        pool_tr = Pool(X_tr_df, y_tr, cat_features=cat_idx_f)
        pool_va = Pool(X_va_df, y_va, cat_features=cat_idx_f)
        cb.fit(pool_tr, eval_set=pool_va, verbose=0)
        cb_pred = cb.predict_proba(Pool(X_va_df, cat_features=cat_idx_f))[:, 1]
        cb_oof[va_idx] = cb_pred

        # Record feature importance once per seed (on the final fold).
        if fold == N_FOLDS - 1:
            all_fi.append(cb.get_feature_importance())
        del cb, pool_tr, pool_va; gc.collect()

        # Sentinel-filled matrices for the NaN-agnostic libraries.
        X_tr_filled = X_tr_df.fillna(-999).values
        X_va_filled = X_va_df.fillna(-999).values

        # --- LightGBM ---
        lgb_tr = lgb.Dataset(X_tr_filled, y_tr, categorical_feature=cat_idx_f)
        lgb_va_ds = lgb.Dataset(X_va_filled, y_va, categorical_feature=cat_idx_f, reference=lgb_tr)
        lgb_params = {
            'objective': 'binary', 'metric': 'auc', 'verbosity': -1,
            'learning_rate': 0.02, 'num_leaves': 63, 'max_depth': 7,
            'min_child_samples': 30, 'reg_alpha': 0.5, 'reg_lambda': 3.0,
            'feature_fraction': 0.6, 'bagging_fraction': 0.75, 'bagging_freq': 5,
            'seed': seed
        }
        lgb_model = lgb.train(lgb_params, lgb_tr, num_boost_round=2000,
                              valid_sets=[lgb_va_ds],
                              callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
        lgb_pred = lgb_model.predict(X_va_filled)
        lgb_oof[va_idx] = lgb_pred
        del lgb_model; gc.collect()

        # --- XGBoost ---
        dtrain = xgb.DMatrix(X_tr_filled, label=y_tr)
        dval = xgb.DMatrix(X_va_filled, label=y_va)
        xgb_params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc',
            'max_depth': 7, 'learning_rate': 0.02,
            'subsample': 0.75, 'colsample_bytree': 0.6,
            'reg_alpha': 0.5, 'reg_lambda': 3.0,
            'min_child_weight': 7,
            'seed': seed, 'verbosity': 0
        }
        xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=2000,
                              evals=[(dval, 'val')],
                              early_stopping_rounds=100, verbose_eval=False)
        xgb_pred = xgb_model.predict(dval)
        xgb_oof[va_idx] = xgb_pred
        del xgb_model, dtrain, dval, df_fold, X_fold; gc.collect()

        if (fold + 1) % 5 == 0:
            print(f" Fold {fold+1}/{N_FOLDS} done")

    # Per-seed OOF AUC for each base model.
    cb_auc = roc_auc_score(y, cb_oof)
    lgb_auc = roc_auc_score(y, lgb_oof)
    xgb_auc = roc_auc_score(y, xgb_oof)
    print(f" CB: {cb_auc:.4f} LGB: {lgb_auc:.4f} XGB: {xgb_auc:.4f}")

    all_cb_oof.append(cb_oof)
    all_lgb_oof.append(lgb_oof)
    all_xgb_oof.append(xgb_oof)
1252
+
1253
# ============================================================
# 10. ENSEMBLE & BLEND
# ============================================================
# Average the per-seed OOF predictions, grid-search convex blend weights
# over the three base models, and report the final blended metrics.
print(f"\n{'='*70}")
print(f" ENSEMBLE RESULTS (MODE={EXPERIMENT_MODE})")
print(f"{'='*70}")

# Seed-averaged OOF probability vector per base model.
cb_avg = np.mean(all_cb_oof, axis=0)
lgb_avg = np.mean(all_lgb_oof, axis=0)
xgb_avg = np.mean(all_xgb_oof, axis=0)

cb_final_auc = roc_auc_score(y, cb_avg)
lgb_final_auc = roc_auc_score(y, lgb_avg)
xgb_final_auc = roc_auc_score(y, xgb_avg)

print(f" CB {len(SEEDS)}-seed avg: {cb_final_auc:.4f}")
print(f" LGB {len(SEEDS)}-seed avg: {lgb_final_auc:.4f}")
print(f" XGB {len(SEEDS)}-seed avg: {xgb_final_auc:.4f}")

# V11: Finer granularity weight search (0.02 step)
# Convex weights; the XGB weight is implied and must land in [0.05, 0.55].
best_auc = 0
best_weights = (0.45, 0.20, 0.35)
for wc in np.arange(0.30, 0.65, 0.02):
    for wl in np.arange(0.05, 0.40, 0.02):
        wx = 1.0 - wc - wl
        if not (0.05 <= wx <= 0.55):
            continue
        candidate_auc = roc_auc_score(y, wc * cb_avg + wl * lgb_avg + wx * xgb_avg)
        if candidate_auc > best_auc:
            best_auc, best_weights = candidate_auc, (wc, wl, wx)

print(f"\n Best 3-model blend: {best_auc:.4f}")
print(f" Delta vs V37.3: {best_auc - 0.8697:+.4f}")
print(f" Delta vs V38.2-PRO-V4: {best_auc - 0.8758:+.4f}")
print(f" Delta vs V38.2-PRO-V8: {best_auc - 0.8753:+.4f}")
print(f" Delta vs V38.2-PRO-V9: {best_auc - 0.8772:+.4f}")
print(f" Delta vs V38.2-PRO-V10: {best_auc - 0.8784:+.4f}")
print(f" Weights: CB={best_weights[0]:.2f} LGB={best_weights[1]:.2f} XGB={best_weights[2]:.2f}")

# Rank-average blend as a scale-free reference point.
rank_blend = (rankdata(cb_avg) + rankdata(lgb_avg) + rankdata(xgb_avg)) / 3
rank_auc = roc_auc_score(y, rank_blend)
print(f" Rank blend: {rank_auc:.4f}")

# Final blended probabilities under the searched weights; clip once for
# the probability-based metrics (Brier / log-loss reject exact 0 and 1).
final_blend_prob = best_weights[0] * cb_avg + best_weights[1] * lgb_avg + best_weights[2] * xgb_avg
final_auc = roc_auc_score(y, final_blend_prob)
clipped_prob = np.clip(final_blend_prob, 1e-7, 1 - 1e-7)
final_brier = brier_score_loss(y, clipped_prob)
final_logloss = log_loss(y, clipped_prob)

print(f"\n FINAL METRICS:")
print(f" AUC: {final_auc:.4f}")
print(f" Brier: {final_brier:.4f}")
print(f" LogLoss: {final_logloss:.4f}")
1306
+
1307
# ============================================================
# 11. FEATURE IMPORTANCE
# ============================================================
# Report seed-averaged CatBoost importances collected during stage 2.
print(f"\n{'='*70}")
print(f" FEATURE IMPORTANCE (MODE={EXPERIMENT_MODE})")
print(f"{'='*70}")

if feature_cols_final and all_fi:
    # Average importances across seeds, then sort descending.
    # Note: rebinds fi_pairs (previously the stage-1 ranking).
    avg_fi = np.mean(all_fi, axis=0)
    fi_pairs = sorted(zip(feature_cols_final, avg_fi), key=lambda x: -x[1])

    print(f" {'Rank':<5s} {'Feature':<55s} {'Importance':>10s}")
    print(f" {'-'*5} {'-'*55} {'-'*10}")
    for i, (fname, imp) in enumerate(fi_pairs[:50]):
        # Family tag for the printed table (first matching rule wins).
        marker = ""
        if 'act_label' in fname or fname in ACT_LABEL_COLS: marker = " [ACT_LABEL_V9]"
        elif '_resid' in fname: marker = " [RESID]"
        elif '_x_school_rate' in fname or '_resid_x_rate' in fname: marker = " [INTERACT]"
        elif '_school_pctile' in fname: marker = " [PCTILE]"
        elif fname.startswith('school_base_rate'): marker = " [SCHOOL_RATE]"
        elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
        elif 'ps2_' in fname: marker = " [PS2]"
        print(f" {i+1:<5d} {fname:<55s} {imp:>10.2f}{marker}")

    v9_in_top30 = sum(1 for f, _ in fi_pairs[:30] if 'act_label' in f or f in ACT_LABEL_COLS)
    bert_in_top30 = sum(1 for f, _ in fi_pairs[:30] if 'act_bert_pca' in f)
    print(f"\n V9 activity label features in top 30: {v9_in_top30}")
    print(f" act_bert_pca features in top 30: {bert_in_top30}")
1335
+
1336
# ============================================================
# 12. SAVE RESULTS
# ============================================================
elapsed = time.time() - start_time

# Full experiment record: configuration, changelog, comparisons against
# prior versions, temporal + CV metrics, and the top-50 importances.
results = {
    'version': f'V38.2-pro-v11-mode-{EXPERIMENT_MODE}',
    'experiment_mode': EXPERIMENT_MODE,
    'mode_description': mode_desc.get(EXPERIMENT_MODE, 'UNKNOWN'),
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    'elapsed_minutes': elapsed / 60,
    'changes': [
        'All V10 features carried forward',
        f'EXPERIMENT MODE: {EXPERIMENT_MODE} - {mode_desc.get(EXPERIMENT_MODE)}',
        'NEW #22: Aggressive feature pruning (150 -> 100)',
        'NEW #23: Hyperparameter tuning (depth 8, lr 0.02, stronger reg)',
        'NEW #24: 3 new domain-specific interaction features',
        'NEW #25: Finer ensemble weight search (0.02 step)',
    ],
    # Baseline numbers from earlier runs, hard-coded for the report.
    'comparison': {
        'v37_3': {'auc': 0.8697, 'temporal_auc': 0.8410},
        'v38_2_pro_v4': {'auc': 0.8758, 'temporal_auc': 0.8555},
        'v38_2_pro_v8': {'auc': 0.8753, 'temporal_auc': 0.8548},
        'v38_2_pro_v9': {'auc': 0.8772, 'temporal_auc': 0.8594},
        'v38_2_pro_v10': {'auc': 0.8784, 'temporal_auc': 0.8631},
    },
    'temporal_validation': {
        'per_seed': temporal_results,
        'avg_blend': float(avg_temporal),
    },
    'groupkfold': {
        'best_3model_blend': float(best_auc),
        'best_weights': [float(w) for w in best_weights],
        'rank_blend': float(rank_auc),
    },
    'final_metrics': {
        'auc': float(final_auc),
        'brier': float(final_brier),
        'logloss': float(final_logloss),
    },
    'n_features': len(feature_cols_final) if feature_cols_final else 0,
    # fi_pairs here is the stage-2 ranking (rebound in section 11) when the
    # guard held; otherwise the list is left empty.
    'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
}

suffix = f'_mode_{EXPERIMENT_MODE}'
with open(os.path.join(OUTPUT_DIR, f'v38_2_pro_v11{suffix}_results.json'), 'w') as f:
    json.dump(results, f, indent=2)

# Per-row OOF prediction export (continues below with more columns).
oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
oof_df['cb_pred'] = cb_avg
oof_df['lgb_pred'] = lgb_avg
1387
+ oof_df['xgb_pred'] = xgb_avg
1388
+ oof_df['final_pred'] = final_blend_prob
1389
+ oof_df.to_csv(os.path.join(OUTPUT_DIR, f'v38_2_pro_v11{suffix}_oof_predictions.csv'), index=False)
1390
+
1391
+ print(f"\n{'='*70}")
1392
+ print(f" V38.2-PRO-V11 MODE={EXPERIMENT_MODE} COMPLETE")
1393
+ print(f" Total time: {elapsed/60:.1f} minutes")
1394
+ print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'}")
1395
+ print(f" GroupKFold AUC: {final_auc:.4f}")
1396
+ print(f" Temporal AUC: {avg_temporal:.4f}")
1397
+ print(f"{'='*70}")