Upload train_v38_2_pro_v2.py with huggingface_hub
Browse files - train_v38_2_pro_v2.py +18 -23
train_v38_2_pro_v2.py
CHANGED
|
@@ -83,7 +83,7 @@ if os.path.exists(RAW_CSV):
|
|
| 83 |
import re
|
| 84 |
df_raw_rounds = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'])
|
| 85 |
for _, row in df_raw_rounds.iterrows():
|
| 86 |
-
sid = str(row['student_id'])
|
| 87 |
summary = str(row.get('school_results_summary', ''))
|
| 88 |
# Parse: School(Round Type):Result | ...
|
| 89 |
entries = summary.split(' | ')
|
|
@@ -111,7 +111,7 @@ print(f"Admit rate: {df[TARGET].mean():.3f}")
|
|
| 111 |
# ============================================================
|
| 112 |
def get_detailed_round(row):
|
| 113 |
"""Map each row to detailed round: EA/ED1/ED2/REA/RD"""
|
| 114 |
-
sid = str(row.get('student_id', ''))
|
| 115 |
school = str(row.get('school', ''))
|
| 116 |
|
| 117 |
# Try to find in round_lookup
|
|
@@ -390,33 +390,28 @@ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
|
|
| 390 |
df['school_n_admits'] = df['school_n_admits'].fillna(0)
|
| 391 |
|
| 392 |
# Step 1b: ED boost per school (NEW)
|
| 393 |
-
#
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
lambda r: r['is_ed1'] == 1, axis=1).values][0]
|
| 397 |
-
] if any(train_df['is_ed1'] == 1) else [])].groupby('school')[TARGET].mean() if 'is_ed1' in train_df.columns else pd.Series(dtype=float)
|
| 398 |
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
ed1_rates = round_school[round_school['is_ed1'] == 1].set_index('school')['mean']
|
| 402 |
-
rd_rates = train_df[~train_df['round_cat'].isin(['ED1', 'ED2', 'EA', 'REA', 'EA2']) if 'round_cat' in train_df.columns else train_df['is_ed1'] == 0].groupby('school')[TARGET].mean()
|
| 403 |
|
| 404 |
ed_boost_map = {}
|
| 405 |
-
for school in
|
| 406 |
-
if school in
|
| 407 |
-
ed_boost_map[school] =
|
| 408 |
|
| 409 |
df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)
|
| 410 |
|
| 411 |
# Also compute ED2 boost
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)
|
| 420 |
|
| 421 |
# Step 2: Residualize student features
|
| 422 |
student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
|
|
@@ -460,7 +455,7 @@ def add_residualized_features(df, train_mask, cat_cols, selected_features=None):
|
|
| 460 |
df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
|
| 461 |
interaction_cols.append('ed1_x_ed_boost')
|
| 462 |
if 'is_ed2' in df.columns:
|
| 463 |
-
df['ed2_x_ed2_boost'] = df['is_ed2'] * df
|
| 464 |
interaction_cols.append('ed2_x_ed2_boost')
|
| 465 |
|
| 466 |
# Step 4: Student percentile within school
|
|
|
|
| 83 |
import re
|
| 84 |
df_raw_rounds = pd.read_csv(RAW_CSV, usecols=['student_id', 'school_results_summary'])
|
| 85 |
for _, row in df_raw_rounds.iterrows():
|
| 86 |
+
sid = str(row['student_id']).replace('.0', '')
|
| 87 |
summary = str(row.get('school_results_summary', ''))
|
| 88 |
# Parse: School(Round Type):Result | ...
|
| 89 |
entries = summary.split(' | ')
|
|
|
|
| 111 |
# ============================================================
|
| 112 |
def get_detailed_round(row):
|
| 113 |
"""Map each row to detailed round: EA/ED1/ED2/REA/RD"""
|
| 114 |
+
sid = str(row.get('student_id', '')).replace('.0', '')
|
| 115 |
school = str(row.get('school', ''))
|
| 116 |
|
| 117 |
# Try to find in round_lookup
|
|
|
|
| 390 |
df['school_n_admits'] = df['school_n_admits'].fillna(0)
|
| 391 |
|
| 392 |
# Step 1b: ED boost per school (NEW)
|
| 393 |
+
# Compute ED1 admit rate per school
|
| 394 |
+
ed1_mask = train_df['is_ed1'] == 1
|
| 395 |
+
rd_mask = train_df['is_early'] == 0 # RD = not early
|
|
|
|
|
|
|
| 396 |
|
| 397 |
+
ed1_school_rates = train_df[ed1_mask].groupby('school')[TARGET].mean()
|
| 398 |
+
rd_school_rates = train_df[rd_mask].groupby('school')[TARGET].mean()
|
|
|
|
|
|
|
| 399 |
|
| 400 |
ed_boost_map = {}
|
| 401 |
+
for school in ed1_school_rates.index:
|
| 402 |
+
if school in rd_school_rates.index:
|
| 403 |
+
ed_boost_map[school] = ed1_school_rates[school] - rd_school_rates[school]
|
| 404 |
|
| 405 |
df['school_ed_boost'] = df['school'].map(ed_boost_map).fillna(0)
|
| 406 |
|
| 407 |
# Also compute ED2 boost
|
| 408 |
+
ed2_mask = train_df['is_ed2'] == 1
|
| 409 |
+
ed2_school_rates = train_df[ed2_mask].groupby('school')[TARGET].mean()
|
| 410 |
+
ed2_boost_map = {}
|
| 411 |
+
for school in ed2_school_rates.index:
|
| 412 |
+
if school in rd_school_rates.index:
|
| 413 |
+
ed2_boost_map[school] = ed2_school_rates[school] - rd_school_rates[school]
|
| 414 |
+
df['school_ed2_boost'] = df['school'].map(ed2_boost_map).fillna(0)
|
|
|
|
| 415 |
|
| 416 |
# Step 2: Residualize student features
|
| 417 |
student_feat_available = [c for c in STUDENT_LEVEL_NUMERIC if c in df.columns]
|
|
|
|
| 455 |
df['ed1_x_ed_boost'] = df['is_ed1'] * df['school_ed_boost']
|
| 456 |
interaction_cols.append('ed1_x_ed_boost')
|
| 457 |
if 'is_ed2' in df.columns:
|
| 458 |
+
df['ed2_x_ed2_boost'] = df['is_ed2'] * df['school_ed2_boost']
|
| 459 |
interaction_cols.append('ed2_x_ed2_boost')
|
| 460 |
|
| 461 |
# Step 4: Student percentile within school
|