Spaces:

catninja123
/

v38-2-bare-model

Paused

catninja123 committed about 1 month ago

Commit

ebffd3c

verified ·

1 Parent(s): a90e56a

Upload train_v38_2_pro.py with huggingface_hub

Files changed (1) hide show

train_v38_2_pro.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """
-V38.2-PRO MODEL - Optimized Residualization + Feature Selection
 ====================================================================
 Improvements over V38.2-resid:
 1. Two-stage training: Stage 1 = feature importance, Stage 2 = selected features only
@@ -386,7 +387,7 @@ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
     all_feat = list(set(num_cols + cat_cols))
     feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
-    for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0']:
         if remove in feature_cols:
             feature_cols.remove(remove)
@@ -833,7 +834,7 @@ results = {
     'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
 }
-with open(os.path.join(OUTPUT_DIR, 'v38_2_pro_results.json'), 'w') as f:
     json.dump(results, f, indent=2)
 oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
@@ -844,7 +845,7 @@ oof_df['final_pred'] = final_blend_prob
 oof_df.to_csv(os.path.join(OUTPUT_DIR, 'v38_2_pro_oof_predictions.csv'), index=False)
 print(f"\n{'='*70}")
-print(f"  V38.2-PRO MODEL COMPLETE")
 print(f"  Total time: {elapsed/60:.1f} minutes")
 print(f"  Features: {len(feature_cols_final) if feature_cols_final else 'N/A'} (selected from {len(all_feature_names)})")
 print(f"  GroupKFold AUC: {final_auc:.4f} (V37.3: 0.8697, V38.2-resid: 0.8682)")

 """
+V38.2-PRO-ABLATION: No portfolio_size
+Original: V38.2-PRO MODEL - Optimized Residualization + Feature Selection
 ====================================================================
 Improvements over V38.2-resid:
 1. Two-stage training: Stage 1 = feature importance, Stage 2 = selected features only
     all_feat = list(set(num_cols + cat_cols))
     feature_cols = list(dict.fromkeys([c for c in all_feat if c in df.columns]))
+    for remove in [TARGET, 'student_id', 'year', 'sid_str', 'Unnamed: 0', 'portfolio_size']:
         if remove in feature_cols:
             feature_cols.remove(remove)
     'feature_importance': [[f, float(i)] for f, i in fi_pairs[:50]] if feature_cols_final and all_fi else [],
 }
+with open(os.path.join(OUTPUT_DIR, 'v38_2_pro_ablation_results.json'), 'w') as f:
     json.dump(results, f, indent=2)
 oof_df = df_base[['student_id', 'school', 'year', TARGET]].copy()
 oof_df.to_csv(os.path.join(OUTPUT_DIR, 'v38_2_pro_oof_predictions.csv'), index=False)
 print(f"\n{'='*70}")
+print(f"  V38.2-PRO ABLATION (no portfolio_size) COMPLETE")
 print(f"  Total time: {elapsed/60:.1f} minutes")
 print(f"  Features: {len(feature_cols_final) if feature_cols_final else 'N/A'} (selected from {len(all_feature_names)})")
 print(f"  GroupKFold AUC: {final_auc:.4f} (V37.3: 0.8697, V38.2-resid: 0.8682)")