Upload train_v38_2_pro_v9.py with huggingface_hub
Browse files- train_v38_2_pro_v9.py +29 -0
train_v38_2_pro_v9.py
CHANGED
|
@@ -68,6 +68,21 @@ ACT_LABEL_COLS = [
|
|
| 68 |
]
|
| 69 |
N_ACT_LABEL_PCA = 5 # Reduce 13 labels to 5 PCA components
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
def safe_num(v, default=np.nan):
|
| 72 |
if isinstance(v, (int, float)):
|
| 73 |
val = float(v)
|
|
@@ -494,6 +509,14 @@ PS_RELATED_FEATURES = set([
|
|
| 494 |
# V9+PS_V5: Add PS V5 features to PS_RELATED_FEATURES for proper residualization
|
| 495 |
PS_RELATED_FEATURES.update(set(ps5_feature_cols))
|
| 496 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
# V9: Activity label features need special handling (only has_act_labels=1 for school_mean)
|
| 498 |
ACT_LABEL_FEATURES = set(ACT_LABEL_COLS + ['act_label_mean', 'act_label_max', 'act_label_min',
|
| 499 |
'act_label_std', 'act_label_range'])
|
|
@@ -982,11 +1005,17 @@ for i, (fname, imp) in enumerate(fi_pairs[:30]):
|
|
| 982 |
elif 'school_base_rate' in fname: marker = " [S]"
|
| 983 |
elif 'ed_boost' in fname: marker = " [ED]"
|
| 984 |
elif 'ps2_' in fname: marker = " [PS2]"
|
|
|
|
|
|
|
| 985 |
elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
|
| 986 |
print(f" {i+1:3d}. {fname:<55s} {imp:>8.2f}{marker}")
|
| 987 |
|
| 988 |
# Count V9 new features in top 50
|
| 989 |
v9_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_label' in f or f in ACT_LABEL_COLS)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 990 |
bert_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_bert_pca' in f)
|
| 991 |
print(f"\n V9 activity label features in top 50: {v9_in_top50}")
|
| 992 |
print(f" act_bert_pca features in top 50: {bert_in_top50}")
|
|
|
|
| 68 |
]
|
| 69 |
N_ACT_LABEL_PCA = 5 # Reduce 13 labels to 5 PCA components
|
| 70 |
|
| 71 |
+
# Supp V1 LLM score columns
|
| 72 |
+
SUPP_V1_COLS = [
|
| 73 |
+
'supp_v1_school_specific_program_references',
|
| 74 |
+
'supp_v1_school_specific_faculty_mentions',
|
| 75 |
+
'supp_v1_school_specific_campus_features',
|
| 76 |
+
'supp_v1_prompt_specific_alignment',
|
| 77 |
+
'supp_v1_personal_connection_to_school',
|
| 78 |
+
'supp_v1_intellectual_engagement_depth',
|
| 79 |
+
'supp_v1_extracurricular_alignment',
|
| 80 |
+
'supp_v1_values_alignment_with_school',
|
| 81 |
+
'supp_v1_specific_future_contribution',
|
| 82 |
+
'supp_v1_unique_personal_context',
|
| 83 |
+
'supp_v1_mean', 'supp_v1_max', 'supp_v1_specificity',
|
| 84 |
+
]
|
| 85 |
+
|
| 86 |
def safe_num(v, default=np.nan):
|
| 87 |
if isinstance(v, (int, float)):
|
| 88 |
val = float(v)
|
|
|
|
| 509 |
# V9+PS_V5: Add PS V5 features to PS_RELATED_FEATURES for proper residualization
|
| 510 |
PS_RELATED_FEATURES.update(set(ps5_feature_cols))
|
| 511 |
|
| 512 |
+
# V9+SUPP_V1: Include Supp V1 LLM score features
|
| 513 |
+
supp_v1_in_data = [c for c in SUPP_V1_COLS if c in df.columns]
|
| 514 |
+
STUDENT_LEVEL_NUMERIC.extend(supp_v1_in_data)
|
| 515 |
+
print(f" Supp V1 features added: {len(supp_v1_in_data)} columns")
|
| 516 |
+
|
| 517 |
+
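# Supp V1 features vary per (student x school) pair; no special PS/act handling needed
# NOTE(review): they are appended to STUDENT_LEVEL_NUMERIC above - confirm that list is the right home for per-pair features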
|
| 518 |
+
# They will use default school_mean residualization
|
| 519 |
+
|
| 520 |
# V9: Activity label features need special handling (only has_act_labels=1 for school_mean)
|
| 521 |
ACT_LABEL_FEATURES = set(ACT_LABEL_COLS + ['act_label_mean', 'act_label_max', 'act_label_min',
|
| 522 |
'act_label_std', 'act_label_range'])
|
|
|
|
| 1005 |
elif 'school_base_rate' in fname: marker = " [S]"
|
| 1006 |
elif 'ed_boost' in fname: marker = " [ED]"
|
| 1007 |
elif 'ps2_' in fname: marker = " [PS2]"
|
| 1008 |
+
elif 'ps5_' in fname or fname.startswith('ps_') and 'bert' not in fname: marker = " [PS_V5]"
|
| 1009 |
+
elif 'supp_v1_' in fname: marker = " [SUPP_V1]"
|
| 1010 |
elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
|
| 1011 |
print(f" {i+1:3d}. {fname:<55s} {imp:>8.2f}{marker}")
|
| 1012 |
|
| 1013 |
# Count V9 new features in top 50
|
| 1014 |
v9_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_label' in f or f in ACT_LABEL_COLS)
|
| 1015 |
+
# Number of PS V5 features among the first 50 entries of fi_pairs: any name containing
# 'ps5_', or a non-BERT 'ps_'-prefixed name that is listed in ps5_feature_cols.
ps5_in_top50 = len([
    name for name, _ in fi_pairs[:50]
    if 'ps5_' in name
    or (name.startswith('ps_') and 'bert' not in name and name in ps5_feature_cols)
])
|
| 1016 |
+
# Number of Supp V1 features (names containing 'supp_v1_') among the first 50 entries of fi_pairs.
supp_v1_in_top50 = len([name for name, _ in fi_pairs[:50] if 'supp_v1_' in name])
|
| 1017 |
+
print(f" PS V5 features in top 50: {ps5_in_top50}")
|
| 1018 |
+
print(f" Supp V1 features in top 50: {supp_v1_in_top50}")
|
| 1019 |
bert_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_bert_pca' in f)
|
| 1020 |
print(f"\n V9 activity label features in top 50: {v9_in_top50}")
|
| 1021 |
print(f" act_bert_pca features in top 50: {bert_in_top50}")
|