Spaces:

catninja123
/

v38-2-bare-model

Paused

App Files Files Community

catninja123 committed on Mar 14

Commit

8a960b8

verified ·

1 Parent(s): 0402746

Upload train_v38_2_pro_v2.py with huggingface_hub

Browse files

Files changed (1) hide show

train_v38_2_pro_v2.py +18 -23

train_v38_2_pro_v2.py CHANGED Viewed

@@ -83,7 +83,7 @@ if os.path.exists(RAW_CSV):
     import re
     df_raw_rounds = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'])
     for _, row in df_raw_rounds.iterrows():
-        sid = str(row['student_id'])
         summary = str(row.get('school_results_summary', ''))
         # Parse: School(Round Type):Result | ...
         entries = summary.split(' | ')
@@ -111,7 +111,7 @@ print(f"Admit rate: {df[TARGET].mean():.3f}")
 # ============================================================
 def get_detailed_round(row):
     """Map each row to detailed round: EA/ED1/ED2/REA/RD"""
-    sid = str(row.get('student_id', ''))
     school = str(row.get('school', ''))
     # Try to find in round_lookup
@@ -390,33 +390,28 @@ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
     df['school_n_admits'] = df['school_n_admits'].fillna(0)
     # Step 1b: ED boost per school (NEW)
-    # For each school, compute ED admit rate - RD admit rate from training data
-    ed_stats = train_df[train_df['round_cat'].isin([
-        train_df['round_cat'].unique()[train_df.apply(
-            lambda r: r['is_ed1'] == 1, axis=1).values][0]
-    ] if any(train_df['is_ed1'] == 1) else [])].groupby('school')[TARGET].mean() if 'is_ed1' in train_df.columns else pd.Series(dtype=float)
-    # Simpler approach: compute ED boost per school from training data
-    round_school = train_df.groupby(['school', 'is_ed1'])[TARGET].agg(['mean', 'count']).reset_index()
-    ed1_rates = round_school[round_school['is_ed1'] == 1].set_index('school')['mean']
-    rd_rates = train_df[~train_df['round_cat'].isin(['ED1', 'ED2', 'EA', 'REA', 'EA2']) if 'round_cat' in train_df.columns else train_df['is_ed1'] == 0].groupby('school')[TARGET].mean()
     ed_boost_map = {}
-    for school in ed1_rates.index:
-        if school in rd_rates.index:
-            ed_boost_map[school] = ed1_rates[school] - rd_rates[school]
     df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)
     # Also compute ED2 boost
-    if 'is_ed2' in train_df.columns:
-        round_school_ed2 = train_df.groupby(['school', 'is_ed2'])[TARGET].agg(['mean', 'count']).reset_index()
-        ed2_rates = round_school_ed2[round_school_ed2['is_ed2'] == 1].set_index('school')['mean']
-        ed2_boost_map = {}
-        for school in ed2_rates.index:
-            if school in rd_rates.index:
-                ed2_boost_map[school] = ed2_rates[school] - rd_rates[school]
-        df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)
     # Step 2: Residualize student features
     student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
@@ -460,7 +455,7 @@ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
         df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
         interaction_cols.append('ed1_x_ed_boost')
     if 'is_ed2' in df.columns:
-        df['ed2_x_ed2_boost'] = df['is_ed2'] * df.get('school_ed2_boost', pd.Series(0, index=df.index))
         interaction_cols.append('ed2_x_ed2_boost')
     # Step 4: Student percentile within school

     import re
     df_raw_rounds = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'])
     for _, row in df_raw_rounds.iterrows():
+        sid = str(row['student_id']).replace('.0', '')
         summary = str(row.get('school_results_summary', ''))
         # Parse: School(Round Type):Result | ...
         entries = summary.split(' | ')
 # ============================================================
 def get_detailed_round(row):
     """Map each row to detailed round: EA/ED1/ED2/REA/RD"""
+    sid = str(row.get('student_id', '')).replace('.0', '')
     school = str(row.get('school', ''))
     # Try to find in round_lookup
     df['school_n_admits'] = df['school_n_admits'].fillna(0)
     # Step 1b: ED boost per school (NEW)
+    # Compute ED1 admit rate per school
+    ed1_mask = train_df['is_ed1'] == 1
+    rd_mask = train_df['is_early'] == 0  # RD = not early
+    ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
+    rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()
     ed_boost_map = {}
+    for school in ed1_school_rates.index:
+        if school in rd_school_rates.index:
+            ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
     df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)
     # Also compute ED2 boost
+    ed2_mask = train_df['is_ed2'] == 1
+    ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
+    ed2_boost_map = {}
+    for school in ed2_school_rates.index:
+        if school in rd_school_rates.index:
+            ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
+    df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)
     # Step 2: Residualize student features
     student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
         df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
         interaction_cols.append('ed1_x_ed_boost')
     if 'is_ed2' in df.columns:
+        df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
         interaction_cols.append('ed2_x_ed2_boost')
     # Step 4: Student percentile within school