catninja123 committed on
Commit
fdb95ea
·
verified ·
1 Parent(s): fdfb89b

Upload train_v38_2_pro_v9.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_v38_2_pro_v9.py +29 -0
train_v38_2_pro_v9.py CHANGED
@@ -68,6 +68,21 @@ ACT_LABEL_COLS = [
68
  ]
69
  N_ACT_LABEL_PCA = 5 # Reduce 13 labels to 5 PCA components
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def safe_num(v, default=np.nan):
72
  if isinstance(v, (int, float)):
73
  val = float(v)
@@ -494,6 +509,14 @@ PS_RELATED_FEATURES = set([
494
  # V9+PS_V5: Add PS V5 features to PS_RELATED_FEATURES for proper residualization
495
  PS_RELATED_FEATURES.update(set(ps5_feature_cols))
496
 
 
 
 
 
 
 
 
 
497
  # V9: Activity label features need special handling (only has_act_labels=1 for school_mean)
498
  ACT_LABEL_FEATURES = set(ACT_LABEL_COLS + ['act_label_mean', 'act_label_max', 'act_label_min',
499
  'act_label_std', 'act_label_range'])
@@ -982,11 +1005,17 @@ for i, (fname, imp) in enumerate(fi_pairs[:30]):
982
  elif 'school_base_rate' in fname: marker = " [S]"
983
  elif 'ed_boost' in fname: marker = " [ED]"
984
  elif 'ps2_' in fname: marker = " [PS2]"
 
 
985
  elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
986
  print(f" {i+1:3d}. {fname:<55s} {imp:>8.2f}{marker}")
987
 
988
  # Count V9 new features in top 50
989
  v9_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_label' in f or f in ACT_LABEL_COLS)
 
 
 
 
990
  bert_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_bert_pca' in f)
991
  print(f"\n V9 activity label features in top 50: {v9_in_top50}")
992
  print(f" act_bert_pca features in top 50: {bert_in_top50}")
 
68
  ]
69
  N_ACT_LABEL_PCA = 5 # Reduce 13 labels to 5 PCA components
70
 
71
+ # Supp V1 LLM score columns
72
+ SUPP_V1_COLS = [
73
+ 'supp_v1_school_specific_program_references',
74
+ 'supp_v1_school_specific_faculty_mentions',
75
+ 'supp_v1_school_specific_campus_features',
76
+ 'supp_v1_prompt_specific_alignment',
77
+ 'supp_v1_personal_connection_to_school',
78
+ 'supp_v1_intellectual_engagement_depth',
79
+ 'supp_v1_extracurricular_alignment',
80
+ 'supp_v1_values_alignment_with_school',
81
+ 'supp_v1_specific_future_contribution',
82
+ 'supp_v1_unique_personal_context',
83
+ 'supp_v1_mean', 'supp_v1_max', 'supp_v1_specificity',
84
+ ]
85
+
86
  def safe_num(v, default=np.nan):
87
  if isinstance(v, (int, float)):
88
  val = float(v)
 
509
  # V9+PS_V5: Add PS V5 features to PS_RELATED_FEATURES for proper residualization
510
  PS_RELATED_FEATURES.update(set(ps5_feature_cols))
511
 
512
+ # V9+SUPP_V1: Include Supp V1 LLM score features
513
+ supp_v1_in_data = [c for c in SUPP_V1_COLS if c in df.columns]
514
+ STUDENT_LEVEL_NUMERIC.extend(supp_v1_in_data)
515
+ print(f" Supp V1 features added: {len(supp_v1_in_data)} columns")
516
+
517
+ # Supp V1 features are school-level (student x school), no special PS/act handling needed
518
+ # They will use default school_mean residualization
519
+
520
  # V9: Activity label features need special handling (only has_act_labels=1 for school_mean)
521
  ACT_LABEL_FEATURES = set(ACT_LABEL_COLS + ['act_label_mean', 'act_label_max', 'act_label_min',
522
  'act_label_std', 'act_label_range'])
 
1005
  elif 'school_base_rate' in fname: marker = " [S]"
1006
  elif 'ed_boost' in fname: marker = " [ED]"
1007
  elif 'ps2_' in fname: marker = " [PS2]"
1008
+ elif 'ps5_' in fname or fname.startswith('ps_') and 'bert' not in fname: marker = " [PS_V5]"
1009
+ elif 'supp_v1_' in fname: marker = " [SUPP_V1]"
1010
  elif 'act_bert_pca' in fname: marker = " [ACT_BERT]"
1011
  print(f" {i+1:3d}. {fname:<55s} {imp:>8.2f}{marker}")
1012
 
1013
  # Count V9 new features in top 50
1014
  v9_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_label' in f or f in ACT_LABEL_COLS)
1015
+ ps5_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'ps5_' in f or (f.startswith('ps_') and 'bert' not in f and f in ps5_feature_cols))
1016
+ supp_v1_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'supp_v1_' in f)
1017
+ print(f" PS V5 features in top 50: {ps5_in_top50}")
1018
+ print(f" Supp V1 features in top 50: {supp_v1_in_top50}")
1019
  bert_in_top50 = sum(1 for f, _ in fi_pairs[:50] if 'act_bert_pca' in f)
1020
  print(f"\n V9 activity label features in top 50: {v9_in_top50}")
1021
  print(f" act_bert_pca features in top 50: {bert_in_top50}")