catninja123 committed on
Commit
8a960b8
·
verified ·
1 Parent(s): 0402746

Upload train_v38_2_pro_v2.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_v38_2_pro_v2.py +18 -23
train_v38_2_pro_v2.py CHANGED
@@ -83,7 +83,7 @@ if os.path.exists(RAW_CSV):
83
  import re
84
  df_raw_rounds = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'])
85
  for _, row in df_raw_rounds.iterrows():
86
- sid = str(row['student_id'])
87
  summary = str(row.get('school_results_summary', ''))
88
  # Parse: School(Round Type):Result | ...
89
  entries = summary.split(' | ')
@@ -111,7 +111,7 @@ print(f"Admit rate: {df[TARGET].mean():.3f}")
111
  # ============================================================
112
  def get_detailed_round(row):
113
  """Map each row to detailed round: EA/ED1/ED2/REA/RD"""
114
- sid = str(row.get('student_id', ''))
115
  school = str(row.get('school', ''))
116
 
117
  # Try to find in round_lookup
@@ -390,33 +390,28 @@ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
390
  df['school_n_admits'] = df['school_n_admits'].fillna(0)
391
 
392
  # Step 1b: ED boost per school (NEW)
393
- # For each school, compute ED admit rate - RD admit rate from training data
394
- ed_stats = train_df[train_df['round_cat'].isin([
395
- train_df['round_cat'].unique()[train_df.apply(
396
- lambda r: r['is_ed1'] == 1, axis=1).values][0]
397
- ] if any(train_df['is_ed1'] == 1) else [])].groupby('school')[TARGET].mean() if 'is_ed1' in train_df.columns else pd.Series(dtype=float)
398
 
399
- # Simpler approach: compute ED boost per school from training data
400
- round_school = train_df.groupby(['school', 'is_ed1'])[TARGET].agg(['mean', 'count']).reset_index()
401
- ed1_rates = round_school[round_school['is_ed1'] == 1].set_index('school')['mean']
402
- rd_rates = train_df[~train_df['round_cat'].isin(['ED1', 'ED2', 'EA', 'REA', 'EA2']) if 'round_cat' in train_df.columns else train_df['is_ed1'] == 0].groupby('school')[TARGET].mean()
403
 
404
  ed_boost_map = {}
405
- for school in ed1_rates.index:
406
- if school in rd_rates.index:
407
- ed_boost_map[school] = ed1_rates[school] - rd_rates[school]
408
 
409
  df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)
410
 
411
  # Also compute ED2 boost
412
- if 'is_ed2' in train_df.columns:
413
- round_school_ed2 = train_df.groupby(['school', 'is_ed2'])[TARGET].agg(['mean', 'count']).reset_index()
414
- ed2_rates = round_school_ed2[round_school_ed2['is_ed2'] == 1].set_index('school')['mean']
415
- ed2_boost_map = {}
416
- for school in ed2_rates.index:
417
- if school in rd_rates.index:
418
- ed2_boost_map[school] = ed2_rates[school] - rd_rates[school]
419
- df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)
420
 
421
  # Step 2: Residualize student features
422
  student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
@@ -460,7 +455,7 @@ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
460
  df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
461
  interaction_cols.append('ed1_x_ed_boost')
462
  if 'is_ed2' in df.columns:
463
- df['ed2_x_ed2_boost'] = df['is_ed2'] * df.get('school_ed2_boost', pd.Series(0, index=df.index))
464
  interaction_cols.append('ed2_x_ed2_boost')
465
 
466
  # Step 4: Student percentile within school
 
83
  import re
84
  df_raw_rounds = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'])
85
  for _, row in df_raw_rounds.iterrows():
86
+ sid = str(row['student_id']).replace('.0', '')
87
  summary = str(row.get('school_results_summary', ''))
88
  # Parse: School(Round Type):Result | ...
89
  entries = summary.split(' | ')
 
111
  # ============================================================
112
  def get_detailed_round(row):
113
  """Map each row to detailed round: EA/ED1/ED2/REA/RD"""
114
+ sid = str(row.get('student_id', '')).replace('.0', '')
115
  school = str(row.get('school', ''))
116
 
117
  # Try to find in round_lookup
 
390
  df['school_n_admits'] = df['school_n_admits'].fillna(0)
391
 
392
  # Step 1b: ED boost per school (NEW)
393
+ # Compute ED1 admit rate per school
394
+ ed1_mask = train_df['is_ed1'] == 1
395
+ rd_mask = train_df['is_early'] == 0 # RD = not early
 
 
396
 
397
+ ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
398
+ rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()
 
 
399
 
400
  ed_boost_map = {}
401
+ for school in ed1_school_rates.index:
402
+ if school in rd_school_rates.index:
403
+ ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
404
 
405
  df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)
406
 
407
  # Also compute ED2 boost
408
+ ed2_mask = train_df['is_ed2'] == 1
409
+ ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
410
+ ed2_boost_map = {}
411
+ for school in ed2_school_rates.index:
412
+ if school in rd_school_rates.index:
413
+ ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
414
+ df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)
 
415
 
416
  # Step 2: Residualize student features
417
  student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
 
455
  df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
456
  interaction_cols.append('ed1_x_ed_boost')
457
  if 'is_ed2' in df.columns:
458
+ df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
459
  interaction_cols.append('ed2_x_ed2_boost')
460
 
461
  # Step 4: Student percentile within school