Upload train_v38_2_pro.py with huggingface_hub
Browse files
- train_v38_2_pro.py +5 -4
train_v38_2_pro.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
-
V38.2-PRO MODEL - Optimized Residualization + Feature Selection
|
|
|
|
| 3 |
====================================================================
|
| 4 |
Improvements over V38.2-resid:
|
| 5 |
1. Two-stage training: Stage 1 = feature importance, Stage 2 = selected features only
|
|
@@ -386,7 +387,7 @@ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
|
|
| 386 |
|
| 387 |
all_feat = list(set(num_cols + cat_cols))
|
| 388 |
feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
|
| 389 |
-
for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0']:
|
| 390 |
if remove in feature_cols:
|
| 391 |
feature_cols.remove(remove)
|
| 392 |
|
|
@@ -833,7 +834,7 @@ results = {
|
|
| 833 |
'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
|
| 834 |
}
|
| 835 |
|
| 836 |
-
with open(os.path.join(OUTPUT_DIR, 'v38_2_pro_results.json'), 'w') as f:
|
| 837 |
json.dump(results, f, indent=2)
|
| 838 |
|
| 839 |
oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
|
|
@@ -844,7 +845,7 @@ oof_df['final_pred'] = final_blend_prob
|
|
| 844 |
oof_df.to_csv(os.path.join(OUTPUT_DIR, 'v38_2_pro_oof_predictions.csv'), index=False)
|
| 845 |
|
| 846 |
print(f"\n{'='*70}")
|
| 847 |
-
print(f" V38.2-PRO COMPLETE")
|
| 848 |
print(f" Total time: {elapsed/60:.1f} minutes")
|
| 849 |
print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'} (selected from {len(all_feature_names)})")
|
| 850 |
print(f" GroupKFold AUC: {final_auc:.4f} (V37.3: 0.8697, V38.2-resid: 0.8682)")
|
|
|
|
| 1 |
"""
|
| 2 |
+
V38.2-PRO-ABLATION: No portfolio_size
|
| 3 |
+
Original: V38.2-PRO MODEL - Optimized Residualization + Feature Selection
|
| 4 |
====================================================================
|
| 5 |
Improvements over V38.2-resid:
|
| 6 |
1. Two-stage training: Stage 1 = feature importance, Stage 2 = selected features only
|
|
|
|
| 387 |
|
| 388 |
all_feat = list(set(num_cols + cat_cols))
|
| 389 |
feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
|
| 390 |
+
for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size']:
|
| 391 |
if remove in feature_cols:
|
| 392 |
feature_cols.remove(remove)
|
| 393 |
|
|
|
|
| 834 |
'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
|
| 835 |
}
|
| 836 |
|
| 837 |
+
with open(os.path.join(OUTPUT_DIR, 'v38_2_pro_ablation_results.json'), 'w') as f:
|
| 838 |
json.dump(results, f, indent=2)
|
| 839 |
|
| 840 |
oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
|
|
|
|
| 845 |
oof_df.to_csv(os.path.join(OUTPUT_DIR, 'v38_2_pro_oof_predictions.csv'), index=False)
|
| 846 |
|
| 847 |
print(f"\n{'='*70}")
|
| 848 |
+
print(f" V38.2-PRO ABLATION (no portfolio_size) COMPLETE")
|
| 849 |
print(f" Total time: {elapsed/60:.1f} minutes")
|
| 850 |
print(f" Features: {len(feature_cols_final) if feature_cols_final else 'N/A'} (selected from {len(all_feature_names)})")
|
| 851 |
print(f" GroupKFold AUC: {final_auc:.4f} (V37.3: 0.8697, V38.2-resid: 0.8682)")
|