Upload 28 files
Browse files- checkpoint_results/mmlu_zero_shot_evaluation_results_base_model.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-1000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-10000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-11000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-12000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-13000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-14000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-15000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-16000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-17000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-18000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-19000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-2000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-20000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-21000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-22000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-23000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-24000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-25000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-26000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-26379.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-3000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-4000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-5000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-6000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-7000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-8000.json +59 -0
- checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-9000.json +59 -0
checkpoint_results/mmlu_zero_shot_evaluation_results_base_model.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.27,
|
| 3 |
+
"anatomy": 0.5777777777777777,
|
| 4 |
+
"astronomy": 0.631578947368421,
|
| 5 |
+
"business_ethics": 0.62,
|
| 6 |
+
"clinical_knowledge": 0.6830188679245283,
|
| 7 |
+
"college_biology": 0.7152777777777778,
|
| 8 |
+
"college_chemistry": 0.43,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.32,
|
| 11 |
+
"college_medicine": 0.5953757225433526,
|
| 12 |
+
"college_physics": 0.39215686274509803,
|
| 13 |
+
"computer_security": 0.68,
|
| 14 |
+
"conceptual_physics": 0.5191489361702127,
|
| 15 |
+
"econometrics": 0.4473684210526316,
|
| 16 |
+
"electrical_engineering": 0.593103448275862,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.4126984126984127,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7354838709677419,
|
| 21 |
+
"high_school_chemistry": 0.4827586206896552,
|
| 22 |
+
"high_school_computer_science": 0.65,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7474747474747475,
|
| 25 |
+
"high_school_government_and_politics": 0.8652849740932642,
|
| 26 |
+
"high_school_macroeconomics": 0.5487179487179488,
|
| 27 |
+
"high_school_mathematics": 0.3148148148148148,
|
| 28 |
+
"high_school_microeconomics": 0.6092436974789915,
|
| 29 |
+
"high_school_physics": 0.31125827814569534,
|
| 30 |
+
"high_school_psychology": 0.8238532110091743,
|
| 31 |
+
"high_school_statistics": 0.4861111111111111,
|
| 32 |
+
"high_school_us_history": 0.7941176470588235,
|
| 33 |
+
"high_school_world_history": 0.7721518987341772,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.7520661157024794,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7730061349693251,
|
| 39 |
+
"machine_learning": 0.5625,
|
| 40 |
+
"management": 0.7378640776699029,
|
| 41 |
+
"marketing": 0.8290598290598291,
|
| 42 |
+
"medical_genetics": 0.71,
|
| 43 |
+
"miscellaneous": 0.7701149425287356,
|
| 44 |
+
"moral_disputes": 0.6878612716763006,
|
| 45 |
+
"moral_scenarios": 0.27150837988826815,
|
| 46 |
+
"nutrition": 0.6764705882352942,
|
| 47 |
+
"philosophy": 0.6430868167202572,
|
| 48 |
+
"prehistory": 0.7037037037037037,
|
| 49 |
+
"professional_accounting": 0.46808510638297873,
|
| 50 |
+
"professional_law": 0.4530638852672751,
|
| 51 |
+
"professional_medicine": 0.6507352941176471,
|
| 52 |
+
"professional_psychology": 0.6143790849673203,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6938775510204082,
|
| 55 |
+
"sociology": 0.8258706467661692,
|
| 56 |
+
"us_foreign_policy": 0.86,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8011695906432749
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-1000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.27,
|
| 3 |
+
"anatomy": 0.42962962962962964,
|
| 4 |
+
"astronomy": 0.506578947368421,
|
| 5 |
+
"business_ethics": 0.57,
|
| 6 |
+
"clinical_knowledge": 0.630188679245283,
|
| 7 |
+
"college_biology": 0.6111111111111112,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.34,
|
| 11 |
+
"college_medicine": 0.45664739884393063,
|
| 12 |
+
"college_physics": 0.3235294117647059,
|
| 13 |
+
"computer_security": 0.57,
|
| 14 |
+
"conceptual_physics": 0.4085106382978723,
|
| 15 |
+
"econometrics": 0.41228070175438597,
|
| 16 |
+
"electrical_engineering": 0.5241379310344828,
|
| 17 |
+
"elementary_mathematics": 0.328042328042328,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.33,
|
| 20 |
+
"high_school_biology": 0.6483870967741936,
|
| 21 |
+
"high_school_chemistry": 0.4187192118226601,
|
| 22 |
+
"high_school_computer_science": 0.66,
|
| 23 |
+
"high_school_european_history": 0.5212121212121212,
|
| 24 |
+
"high_school_geography": 0.6262626262626263,
|
| 25 |
+
"high_school_government_and_politics": 0.7823834196891192,
|
| 26 |
+
"high_school_macroeconomics": 0.5358974358974359,
|
| 27 |
+
"high_school_mathematics": 0.28888888888888886,
|
| 28 |
+
"high_school_microeconomics": 0.5336134453781513,
|
| 29 |
+
"high_school_physics": 0.2052980132450331,
|
| 30 |
+
"high_school_psychology": 0.7045871559633028,
|
| 31 |
+
"high_school_statistics": 0.4027777777777778,
|
| 32 |
+
"high_school_us_history": 0.6519607843137255,
|
| 33 |
+
"high_school_world_history": 0.6033755274261603,
|
| 34 |
+
"human_aging": 0.5695067264573991,
|
| 35 |
+
"human_sexuality": 0.5801526717557252,
|
| 36 |
+
"international_law": 0.6859504132231405,
|
| 37 |
+
"jurisprudence": 0.6481481481481481,
|
| 38 |
+
"logical_fallacies": 0.6503067484662577,
|
| 39 |
+
"machine_learning": 0.5,
|
| 40 |
+
"management": 0.5631067961165048,
|
| 41 |
+
"marketing": 0.7649572649572649,
|
| 42 |
+
"medical_genetics": 0.63,
|
| 43 |
+
"miscellaneous": 0.6091954022988506,
|
| 44 |
+
"moral_disputes": 0.6560693641618497,
|
| 45 |
+
"moral_scenarios": 0.21675977653631284,
|
| 46 |
+
"nutrition": 0.6209150326797386,
|
| 47 |
+
"philosophy": 0.6045016077170418,
|
| 48 |
+
"prehistory": 0.5679012345679012,
|
| 49 |
+
"professional_accounting": 0.375886524822695,
|
| 50 |
+
"professional_law": 0.394393741851369,
|
| 51 |
+
"professional_medicine": 0.5845588235294118,
|
| 52 |
+
"professional_psychology": 0.5049019607843137,
|
| 53 |
+
"public_relations": 0.5545454545454546,
|
| 54 |
+
"security_studies": 0.6448979591836734,
|
| 55 |
+
"sociology": 0.7412935323383084,
|
| 56 |
+
"us_foreign_policy": 0.75,
|
| 57 |
+
"virology": 0.46987951807228917,
|
| 58 |
+
"world_religions": 0.6783625730994152
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-10000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.5555555555555556,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.56,
|
| 6 |
+
"clinical_knowledge": 0.6830188679245283,
|
| 7 |
+
"college_biology": 0.7083333333333334,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.46,
|
| 10 |
+
"college_mathematics": 0.27,
|
| 11 |
+
"college_medicine": 0.5895953757225434,
|
| 12 |
+
"college_physics": 0.35294117647058826,
|
| 13 |
+
"computer_security": 0.7,
|
| 14 |
+
"conceptual_physics": 0.5191489361702127,
|
| 15 |
+
"econometrics": 0.47368421052631576,
|
| 16 |
+
"electrical_engineering": 0.5655172413793104,
|
| 17 |
+
"elementary_mathematics": 0.3968253968253968,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.36,
|
| 20 |
+
"high_school_biology": 0.7580645161290323,
|
| 21 |
+
"high_school_chemistry": 0.4729064039408867,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7373737373737373,
|
| 25 |
+
"high_school_government_and_politics": 0.844559585492228,
|
| 26 |
+
"high_school_macroeconomics": 0.5512820512820513,
|
| 27 |
+
"high_school_mathematics": 0.3037037037037037,
|
| 28 |
+
"high_school_microeconomics": 0.592436974789916,
|
| 29 |
+
"high_school_physics": 0.2913907284768212,
|
| 30 |
+
"high_school_psychology": 0.8055045871559633,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.803921568627451,
|
| 33 |
+
"high_school_world_history": 0.7637130801687764,
|
| 34 |
+
"human_aging": 0.6547085201793722,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5446428571428571,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8247863247863247,
|
| 42 |
+
"medical_genetics": 0.78,
|
| 43 |
+
"miscellaneous": 0.756066411238825,
|
| 44 |
+
"moral_disputes": 0.6734104046242775,
|
| 45 |
+
"moral_scenarios": 0.2659217877094972,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6430868167202572,
|
| 48 |
+
"prehistory": 0.6975308641975309,
|
| 49 |
+
"professional_accounting": 0.45390070921985815,
|
| 50 |
+
"professional_law": 0.4556714471968709,
|
| 51 |
+
"professional_medicine": 0.6507352941176471,
|
| 52 |
+
"professional_psychology": 0.6258169934640523,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8208955223880597,
|
| 56 |
+
"us_foreign_policy": 0.84,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-11000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.5555555555555556,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.57,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7152777777777778,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.46,
|
| 10 |
+
"college_mathematics": 0.27,
|
| 11 |
+
"college_medicine": 0.5953757225433526,
|
| 12 |
+
"college_physics": 0.35294117647058826,
|
| 13 |
+
"computer_security": 0.69,
|
| 14 |
+
"conceptual_physics": 0.5106382978723404,
|
| 15 |
+
"econometrics": 0.4824561403508772,
|
| 16 |
+
"electrical_engineering": 0.5793103448275863,
|
| 17 |
+
"elementary_mathematics": 0.4021164021164021,
|
| 18 |
+
"formal_logic": 0.4365079365079365,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7580645161290323,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7393939393939394,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.5538461538461539,
|
| 27 |
+
"high_school_mathematics": 0.3111111111111111,
|
| 28 |
+
"high_school_microeconomics": 0.5966386554621849,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8091743119266055,
|
| 31 |
+
"high_school_statistics": 0.44907407407407407,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7679324894514767,
|
| 34 |
+
"human_aging": 0.6502242152466368,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5446428571428571,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.811965811965812,
|
| 42 |
+
"medical_genetics": 0.75,
|
| 43 |
+
"miscellaneous": 0.7547892720306514,
|
| 44 |
+
"moral_disputes": 0.6734104046242775,
|
| 45 |
+
"moral_scenarios": 0.2659217877094972,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.7067901234567902,
|
| 49 |
+
"professional_accounting": 0.45390070921985815,
|
| 50 |
+
"professional_law": 0.4576271186440678,
|
| 51 |
+
"professional_medicine": 0.6360294117647058,
|
| 52 |
+
"professional_psychology": 0.6258169934640523,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.86,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8011695906432749
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-12000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.58,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7083333333333334,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.3,
|
| 11 |
+
"college_medicine": 0.6011560693641619,
|
| 12 |
+
"college_physics": 0.35294117647058826,
|
| 13 |
+
"computer_security": 0.69,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.47368421052631576,
|
| 16 |
+
"electrical_engineering": 0.5862068965517241,
|
| 17 |
+
"elementary_mathematics": 0.4074074074074074,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.4,
|
| 20 |
+
"high_school_biology": 0.7612903225806451,
|
| 21 |
+
"high_school_chemistry": 0.4827586206896552,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7373737373737373,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3037037037037037,
|
| 28 |
+
"high_school_microeconomics": 0.5966386554621849,
|
| 29 |
+
"high_school_physics": 0.2913907284768212,
|
| 30 |
+
"high_school_psychology": 0.8073394495412844,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.803921568627451,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7423312883435583,
|
| 39 |
+
"machine_learning": 0.5446428571428571,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.7547892720306514,
|
| 44 |
+
"moral_disputes": 0.6705202312138728,
|
| 45 |
+
"moral_scenarios": 0.2681564245810056,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6559485530546624,
|
| 48 |
+
"prehistory": 0.6975308641975309,
|
| 49 |
+
"professional_accounting": 0.45390070921985815,
|
| 50 |
+
"professional_law": 0.4576271186440678,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.6241830065359477,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8208955223880597,
|
| 56 |
+
"us_foreign_policy": 0.84,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-13000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.5481481481481482,
|
| 4 |
+
"astronomy": 0.5855263157894737,
|
| 5 |
+
"business_ethics": 0.58,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7083333333333334,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5953757225433526,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.72,
|
| 14 |
+
"conceptual_physics": 0.5276595744680851,
|
| 15 |
+
"econometrics": 0.4473684210526316,
|
| 16 |
+
"electrical_engineering": 0.6,
|
| 17 |
+
"elementary_mathematics": 0.4126984126984127,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.4,
|
| 20 |
+
"high_school_biology": 0.7580645161290323,
|
| 21 |
+
"high_school_chemistry": 0.47783251231527096,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7393939393939394,
|
| 24 |
+
"high_school_geography": 0.7424242424242424,
|
| 25 |
+
"high_school_government_and_politics": 0.8393782383419689,
|
| 26 |
+
"high_school_macroeconomics": 0.5538461538461539,
|
| 27 |
+
"high_school_mathematics": 0.3148148148148148,
|
| 28 |
+
"high_school_microeconomics": 0.5882352941176471,
|
| 29 |
+
"high_school_physics": 0.304635761589404,
|
| 30 |
+
"high_school_psychology": 0.8018348623853211,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6547085201793722,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7423312883435583,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8247863247863247,
|
| 42 |
+
"medical_genetics": 0.78,
|
| 43 |
+
"miscellaneous": 0.7522349936143039,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.27039106145251396,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6495176848874598,
|
| 48 |
+
"prehistory": 0.691358024691358,
|
| 49 |
+
"professional_accounting": 0.4574468085106383,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6470588235294118,
|
| 52 |
+
"professional_psychology": 0.6209150326797386,
|
| 53 |
+
"public_relations": 0.6545454545454545,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.86,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-14000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.5555555555555556,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7083333333333334,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.3,
|
| 11 |
+
"college_medicine": 0.6069364161849711,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.7,
|
| 14 |
+
"conceptual_physics": 0.5191489361702127,
|
| 15 |
+
"econometrics": 0.49122807017543857,
|
| 16 |
+
"electrical_engineering": 0.593103448275862,
|
| 17 |
+
"elementary_mathematics": 0.4074074074074074,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.4,
|
| 20 |
+
"high_school_biology": 0.7612903225806451,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7272727272727273,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.844559585492228,
|
| 26 |
+
"high_school_macroeconomics": 0.5564102564102564,
|
| 27 |
+
"high_school_mathematics": 0.3111111111111111,
|
| 28 |
+
"high_school_microeconomics": 0.592436974789916,
|
| 29 |
+
"high_school_physics": 0.304635761589404,
|
| 30 |
+
"high_school_psychology": 0.8018348623853211,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5535714285714286,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8247863247863247,
|
| 42 |
+
"medical_genetics": 0.78,
|
| 43 |
+
"miscellaneous": 0.756066411238825,
|
| 44 |
+
"moral_disputes": 0.6734104046242775,
|
| 45 |
+
"moral_scenarios": 0.26927374301675977,
|
| 46 |
+
"nutrition": 0.6764705882352942,
|
| 47 |
+
"philosophy": 0.6559485530546624,
|
| 48 |
+
"prehistory": 0.6944444444444444,
|
| 49 |
+
"professional_accounting": 0.4574468085106383,
|
| 50 |
+
"professional_law": 0.4576271186440678,
|
| 51 |
+
"professional_medicine": 0.6507352941176471,
|
| 52 |
+
"professional_psychology": 0.6241830065359477,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8208955223880597,
|
| 56 |
+
"us_foreign_policy": 0.84,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8128654970760234
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-15000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.31,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.3,
|
| 11 |
+
"college_medicine": 0.5895953757225434,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5234042553191489,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6,
|
| 17 |
+
"elementary_mathematics": 0.4021164021164021,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.4,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.49261083743842365,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.844559585492228,
|
| 26 |
+
"high_school_macroeconomics": 0.5641025641025641,
|
| 27 |
+
"high_school_mathematics": 0.3148148148148148,
|
| 28 |
+
"high_school_microeconomics": 0.5882352941176471,
|
| 29 |
+
"high_school_physics": 0.304635761589404,
|
| 30 |
+
"high_school_psychology": 0.8018348623853211,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6591928251121076,
|
| 35 |
+
"human_sexuality": 0.7175572519083969,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8290598290598291,
|
| 42 |
+
"medical_genetics": 0.78,
|
| 43 |
+
"miscellaneous": 0.7586206896551724,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.26927374301675977,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.4645390070921986,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6433823529411765,
|
| 52 |
+
"professional_psychology": 0.6241830065359477,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8208955223880597,
|
| 56 |
+
"us_foreign_policy": 0.84,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-16000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.5555555555555556,
|
| 4 |
+
"astronomy": 0.5855263157894737,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6792452830188679,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.43,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.3,
|
| 11 |
+
"college_medicine": 0.5838150289017341,
|
| 12 |
+
"college_physics": 0.37254901960784315,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5106382978723404,
|
| 15 |
+
"econometrics": 0.45614035087719296,
|
| 16 |
+
"electrical_engineering": 0.6068965517241379,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.4365079365079365,
|
| 19 |
+
"global_facts": 0.38,
|
| 20 |
+
"high_school_biology": 0.7516129032258064,
|
| 21 |
+
"high_school_chemistry": 0.47783251231527096,
|
| 22 |
+
"high_school_computer_science": 0.65,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7323232323232324,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.5564102564102564,
|
| 27 |
+
"high_school_mathematics": 0.3111111111111111,
|
| 28 |
+
"high_school_microeconomics": 0.5882352941176471,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8055045871559633,
|
| 31 |
+
"high_school_statistics": 0.44907407407407407,
|
| 32 |
+
"high_school_us_history": 0.803921568627451,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6502242152466368,
|
| 35 |
+
"human_sexuality": 0.7175572519083969,
|
| 36 |
+
"international_law": 0.71900826446281,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8247863247863247,
|
| 42 |
+
"medical_genetics": 0.76,
|
| 43 |
+
"miscellaneous": 0.7573435504469987,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.27262569832402234,
|
| 46 |
+
"nutrition": 0.6764705882352942,
|
| 47 |
+
"philosophy": 0.639871382636656,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.46099290780141844,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6470588235294118,
|
| 52 |
+
"professional_psychology": 0.6258169934640523,
|
| 53 |
+
"public_relations": 0.6636363636363637,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8109452736318408,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.7953216374269005
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-17000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.5555555555555556,
|
| 4 |
+
"astronomy": 0.6052631578947368,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6830188679245283,
|
| 7 |
+
"college_biology": 0.7083333333333334,
|
| 8 |
+
"college_chemistry": 0.43,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.6011560693641619,
|
| 12 |
+
"college_physics": 0.37254901960784315,
|
| 13 |
+
"computer_security": 0.72,
|
| 14 |
+
"conceptual_physics": 0.502127659574468,
|
| 15 |
+
"econometrics": 0.45614035087719296,
|
| 16 |
+
"electrical_engineering": 0.593103448275862,
|
| 17 |
+
"elementary_mathematics": 0.4074074074074074,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.4,
|
| 20 |
+
"high_school_biology": 0.7483870967741936,
|
| 21 |
+
"high_school_chemistry": 0.46798029556650245,
|
| 22 |
+
"high_school_computer_science": 0.66,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.5564102564102564,
|
| 27 |
+
"high_school_mathematics": 0.2962962962962963,
|
| 28 |
+
"high_school_microeconomics": 0.5882352941176471,
|
| 29 |
+
"high_school_physics": 0.31125827814569534,
|
| 30 |
+
"high_school_psychology": 0.8073394495412844,
|
| 31 |
+
"high_school_statistics": 0.4398148148148148,
|
| 32 |
+
"high_school_us_history": 0.803921568627451,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6591928251121076,
|
| 35 |
+
"human_sexuality": 0.7175572519083969,
|
| 36 |
+
"international_law": 0.7272727272727273,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7281553398058253,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.76,
|
| 43 |
+
"miscellaneous": 0.7598978288633461,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.27932960893854747,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6302250803858521,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.46099290780141844,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6433823529411765,
|
| 52 |
+
"professional_psychology": 0.6209150326797386,
|
| 53 |
+
"public_relations": 0.6636363636363637,
|
| 54 |
+
"security_studies": 0.6693877551020408,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.7953216374269005
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-18000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6679245283018868,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.43,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.3,
|
| 11 |
+
"college_medicine": 0.6011560693641619,
|
| 12 |
+
"college_physics": 0.37254901960784315,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5191489361702127,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6,
|
| 17 |
+
"elementary_mathematics": 0.4021164021164021,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7516129032258064,
|
| 21 |
+
"high_school_chemistry": 0.4827586206896552,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7323232323232324,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3111111111111111,
|
| 28 |
+
"high_school_microeconomics": 0.5882352941176471,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.44907407407407407,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6591928251121076,
|
| 35 |
+
"human_sexuality": 0.7175572519083969,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7423312883435583,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.76,
|
| 43 |
+
"miscellaneous": 0.7573435504469987,
|
| 44 |
+
"moral_disputes": 0.6763005780346821,
|
| 45 |
+
"moral_scenarios": 0.2748603351955307,
|
| 46 |
+
"nutrition": 0.673202614379085,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6882716049382716,
|
| 49 |
+
"professional_accounting": 0.4645390070921986,
|
| 50 |
+
"professional_law": 0.45697522816166886,
|
| 51 |
+
"professional_medicine": 0.6470588235294118,
|
| 52 |
+
"professional_psychology": 0.6209150326797386,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8109452736318408,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8011695906432749
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-19000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.31,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.6,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5953757225433526,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5191489361702127,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6137931034482759,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.844559585492228,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3037037037037037,
|
| 28 |
+
"high_school_microeconomics": 0.5882352941176471,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6681614349775785,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7423312883435583,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.756066411238825,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.27039106145251396,
|
| 46 |
+
"nutrition": 0.673202614379085,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6882716049382716,
|
| 49 |
+
"professional_accounting": 0.4645390070921986,
|
| 50 |
+
"professional_law": 0.4556714471968709,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.6209150326797386,
|
| 53 |
+
"public_relations": 0.6636363636363637,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-2000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.5555555555555556,
|
| 4 |
+
"astronomy": 0.6052631578947368,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6754716981132075,
|
| 7 |
+
"college_biology": 0.6875,
|
| 8 |
+
"college_chemistry": 0.41,
|
| 9 |
+
"college_computer_science": 0.45,
|
| 10 |
+
"college_mathematics": 0.35,
|
| 11 |
+
"college_medicine": 0.5838150289017341,
|
| 12 |
+
"college_physics": 0.37254901960784315,
|
| 13 |
+
"computer_security": 0.69,
|
| 14 |
+
"conceptual_physics": 0.5063829787234042,
|
| 15 |
+
"econometrics": 0.45614035087719296,
|
| 16 |
+
"electrical_engineering": 0.5793103448275863,
|
| 17 |
+
"elementary_mathematics": 0.3941798941798942,
|
| 18 |
+
"formal_logic": 0.4523809523809524,
|
| 19 |
+
"global_facts": 0.36,
|
| 20 |
+
"high_school_biology": 0.7580645161290323,
|
| 21 |
+
"high_school_chemistry": 0.47783251231527096,
|
| 22 |
+
"high_school_computer_science": 0.66,
|
| 23 |
+
"high_school_european_history": 0.696969696969697,
|
| 24 |
+
"high_school_geography": 0.7474747474747475,
|
| 25 |
+
"high_school_government_and_politics": 0.8393782383419689,
|
| 26 |
+
"high_school_macroeconomics": 0.5487179487179488,
|
| 27 |
+
"high_school_mathematics": 0.3148148148148148,
|
| 28 |
+
"high_school_microeconomics": 0.6008403361344538,
|
| 29 |
+
"high_school_physics": 0.304635761589404,
|
| 30 |
+
"high_school_psychology": 0.8165137614678899,
|
| 31 |
+
"high_school_statistics": 0.4675925925925926,
|
| 32 |
+
"high_school_us_history": 0.7941176470588235,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6502242152466368,
|
| 35 |
+
"human_sexuality": 0.7099236641221374,
|
| 36 |
+
"international_law": 0.7272727272727273,
|
| 37 |
+
"jurisprudence": 0.7314814814814815,
|
| 38 |
+
"logical_fallacies": 0.7300613496932515,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7378640776699029,
|
| 41 |
+
"marketing": 0.8034188034188035,
|
| 42 |
+
"medical_genetics": 0.71,
|
| 43 |
+
"miscellaneous": 0.7586206896551724,
|
| 44 |
+
"moral_disputes": 0.6907514450867052,
|
| 45 |
+
"moral_scenarios": 0.24581005586592178,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6270096463022508,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.450354609929078,
|
| 50 |
+
"professional_law": 0.4556714471968709,
|
| 51 |
+
"professional_medicine": 0.6360294117647058,
|
| 52 |
+
"professional_psychology": 0.6143790849673203,
|
| 53 |
+
"public_relations": 0.6545454545454545,
|
| 54 |
+
"security_studies": 0.673469387755102,
|
| 55 |
+
"sociology": 0.8308457711442786,
|
| 56 |
+
"us_foreign_policy": 0.84,
|
| 57 |
+
"virology": 0.4939759036144578,
|
| 58 |
+
"world_religions": 0.7660818713450293
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-20000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5855263157894737,
|
| 5 |
+
"business_ethics": 0.6,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5953757225433526,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6137931034482759,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.4827586206896552,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7393939393939394,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3,
|
| 28 |
+
"high_school_microeconomics": 0.5882352941176471,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7423312883435583,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.756066411238825,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.26927374301675977,
|
| 46 |
+
"nutrition": 0.673202614379085,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.691358024691358,
|
| 49 |
+
"professional_accounting": 0.46808510638297873,
|
| 50 |
+
"professional_law": 0.4556714471968709,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.619281045751634,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-21000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5953757225433526,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6137931034482759,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.844559585492228,
|
| 26 |
+
"high_school_macroeconomics": 0.5615384615384615,
|
| 27 |
+
"high_school_mathematics": 0.3,
|
| 28 |
+
"high_school_microeconomics": 0.5840336134453782,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.7547892720306514,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.2681564245810056,
|
| 46 |
+
"nutrition": 0.673202614379085,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.691358024691358,
|
| 49 |
+
"professional_accounting": 0.4645390070921986,
|
| 50 |
+
"professional_law": 0.45632333767926986,
|
| 51 |
+
"professional_medicine": 0.6433823529411765,
|
| 52 |
+
"professional_psychology": 0.6176470588235294,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.8011695906432749
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-22000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.31,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6641509433962264,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.49,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5895953757225434,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6137931034482759,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7516129032258064,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3037037037037037,
|
| 28 |
+
"high_school_microeconomics": 0.5840336134453782,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.71900826446281,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.756066411238825,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.27039106145251396,
|
| 46 |
+
"nutrition": 0.6764705882352942,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.4645390070921986,
|
| 50 |
+
"professional_law": 0.4556714471968709,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.619281045751634,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-23000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.6,
|
| 6 |
+
"clinical_knowledge": 0.6679245283018868,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5895953757225434,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6068965517241379,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7393939393939394,
|
| 24 |
+
"high_school_geography": 0.7272727272727273,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.5615384615384615,
|
| 27 |
+
"high_school_mathematics": 0.3037037037037037,
|
| 28 |
+
"high_school_microeconomics": 0.5840336134453782,
|
| 29 |
+
"high_school_physics": 0.304635761589404,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.756066411238825,
|
| 44 |
+
"moral_disputes": 0.6763005780346821,
|
| 45 |
+
"moral_scenarios": 0.27150837988826815,
|
| 46 |
+
"nutrition": 0.6764705882352942,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.4645390070921986,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.6209150326797386,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.8011695906432749
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-24000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6641509433962264,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5895953757225434,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6068965517241379,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7323232323232324,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.5615384615384615,
|
| 27 |
+
"high_school_mathematics": 0.3074074074074074,
|
| 28 |
+
"high_school_microeconomics": 0.5840336134453782,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.71900826446281,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.7547892720306514,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.27039106145251396,
|
| 46 |
+
"nutrition": 0.6764705882352942,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.4645390070921986,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.619281045751634,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-25000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.3,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6641509433962264,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5895953757225434,
|
| 12 |
+
"college_physics": 0.37254901960784315,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6068965517241379,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7516129032258064,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7323232323232324,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3074074074074074,
|
| 28 |
+
"high_school_microeconomics": 0.5840336134453782,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.71900826446281,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5178571428571429,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.7547892720306514,
|
| 44 |
+
"moral_disputes": 0.6763005780346821,
|
| 45 |
+
"moral_scenarios": 0.27039106145251396,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.46808510638297873,
|
| 50 |
+
"professional_law": 0.45436766623207303,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.619281045751634,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.8011695906432749
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-26000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6641509433962264,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5953757225433526,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6068965517241379,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7516129032258064,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7323232323232324,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3074074074074074,
|
| 28 |
+
"high_school_microeconomics": 0.5840336134453782,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.7547892720306514,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.27039106145251396,
|
| 46 |
+
"nutrition": 0.6764705882352942,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.46808510638297873,
|
| 50 |
+
"professional_law": 0.45371577574967403,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.619281045751634,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-26379.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6679245283018868,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.42,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5895953757225434,
|
| 12 |
+
"college_physics": 0.37254901960784315,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6068965517241379,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7516129032258064,
|
| 21 |
+
"high_school_chemistry": 0.4876847290640394,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7323232323232324,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3074074074074074,
|
| 28 |
+
"high_school_microeconomics": 0.5882352941176471,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8036697247706422,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6636771300448431,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.71900826446281,
|
| 37 |
+
"jurisprudence": 0.7407407407407407,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.77,
|
| 43 |
+
"miscellaneous": 0.7535121328224776,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.27039106145251396,
|
| 46 |
+
"nutrition": 0.673202614379085,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.6851851851851852,
|
| 49 |
+
"professional_accounting": 0.46808510638297873,
|
| 50 |
+
"professional_law": 0.45371577574967403,
|
| 51 |
+
"professional_medicine": 0.6397058823529411,
|
| 52 |
+
"professional_psychology": 0.619281045751634,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.85,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.8011695906432749
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-3000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.5407407407407407,
|
| 4 |
+
"astronomy": 0.5986842105263158,
|
| 5 |
+
"business_ethics": 0.59,
|
| 6 |
+
"clinical_knowledge": 0.6830188679245283,
|
| 7 |
+
"college_biology": 0.6875,
|
| 8 |
+
"college_chemistry": 0.4,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.33,
|
| 11 |
+
"college_medicine": 0.5780346820809249,
|
| 12 |
+
"college_physics": 0.38235294117647056,
|
| 13 |
+
"computer_security": 0.7,
|
| 14 |
+
"conceptual_physics": 0.502127659574468,
|
| 15 |
+
"econometrics": 0.4824561403508772,
|
| 16 |
+
"electrical_engineering": 0.6,
|
| 17 |
+
"elementary_mathematics": 0.3968253968253968,
|
| 18 |
+
"formal_logic": 0.4603174603174603,
|
| 19 |
+
"global_facts": 0.41,
|
| 20 |
+
"high_school_biology": 0.7516129032258064,
|
| 21 |
+
"high_school_chemistry": 0.4827586206896552,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7272727272727273,
|
| 24 |
+
"high_school_geography": 0.7373737373737373,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.5538461538461539,
|
| 27 |
+
"high_school_mathematics": 0.32222222222222224,
|
| 28 |
+
"high_school_microeconomics": 0.6008403361344538,
|
| 29 |
+
"high_school_physics": 0.31125827814569534,
|
| 30 |
+
"high_school_psychology": 0.8165137614678899,
|
| 31 |
+
"high_school_statistics": 0.44907407407407407,
|
| 32 |
+
"high_school_us_history": 0.7941176470588235,
|
| 33 |
+
"high_school_world_history": 0.7510548523206751,
|
| 34 |
+
"human_aging": 0.6502242152466368,
|
| 35 |
+
"human_sexuality": 0.7099236641221374,
|
| 36 |
+
"international_law": 0.71900826446281,
|
| 37 |
+
"jurisprudence": 0.7222222222222222,
|
| 38 |
+
"logical_fallacies": 0.7484662576687117,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7184466019417476,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.75,
|
| 43 |
+
"miscellaneous": 0.7471264367816092,
|
| 44 |
+
"moral_disputes": 0.6907514450867052,
|
| 45 |
+
"moral_scenarios": 0.25921787709497207,
|
| 46 |
+
"nutrition": 0.6601307189542484,
|
| 47 |
+
"philosophy": 0.6270096463022508,
|
| 48 |
+
"prehistory": 0.6820987654320988,
|
| 49 |
+
"professional_accounting": 0.4574468085106383,
|
| 50 |
+
"professional_law": 0.45436766623207303,
|
| 51 |
+
"professional_medicine": 0.6470588235294118,
|
| 52 |
+
"professional_psychology": 0.6160130718954249,
|
| 53 |
+
"public_relations": 0.6818181818181818,
|
| 54 |
+
"security_studies": 0.673469387755102,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.83,
|
| 57 |
+
"virology": 0.5060240963855421,
|
| 58 |
+
"world_religions": 0.7719298245614035
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-4000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.28,
|
| 3 |
+
"anatomy": 0.5481481481481482,
|
| 4 |
+
"astronomy": 0.5986842105263158,
|
| 5 |
+
"business_ethics": 0.57,
|
| 6 |
+
"clinical_knowledge": 0.690566037735849,
|
| 7 |
+
"college_biology": 0.6875,
|
| 8 |
+
"college_chemistry": 0.41,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.32,
|
| 11 |
+
"college_medicine": 0.5606936416184971,
|
| 12 |
+
"college_physics": 0.37254901960784315,
|
| 13 |
+
"computer_security": 0.69,
|
| 14 |
+
"conceptual_physics": 0.502127659574468,
|
| 15 |
+
"econometrics": 0.47368421052631576,
|
| 16 |
+
"electrical_engineering": 0.6137931034482759,
|
| 17 |
+
"elementary_mathematics": 0.3941798941798942,
|
| 18 |
+
"formal_logic": 0.4523809523809524,
|
| 19 |
+
"global_facts": 0.38,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.4630541871921182,
|
| 22 |
+
"high_school_computer_science": 0.62,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7424242424242424,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.5461538461538461,
|
| 27 |
+
"high_school_mathematics": 0.3296296296296296,
|
| 28 |
+
"high_school_microeconomics": 0.6050420168067226,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8110091743119267,
|
| 31 |
+
"high_school_statistics": 0.4444444444444444,
|
| 32 |
+
"high_school_us_history": 0.7941176470588235,
|
| 33 |
+
"high_school_world_history": 0.7552742616033755,
|
| 34 |
+
"human_aging": 0.6547085201793722,
|
| 35 |
+
"human_sexuality": 0.7175572519083969,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.7222222222222222,
|
| 38 |
+
"logical_fallacies": 0.7423312883435583,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7281553398058253,
|
| 41 |
+
"marketing": 0.8290598290598291,
|
| 42 |
+
"medical_genetics": 0.72,
|
| 43 |
+
"miscellaneous": 0.7484035759897829,
|
| 44 |
+
"moral_disputes": 0.6878612716763006,
|
| 45 |
+
"moral_scenarios": 0.2681564245810056,
|
| 46 |
+
"nutrition": 0.6666666666666666,
|
| 47 |
+
"philosophy": 0.617363344051447,
|
| 48 |
+
"prehistory": 0.6790123456790124,
|
| 49 |
+
"professional_accounting": 0.4432624113475177,
|
| 50 |
+
"professional_law": 0.4576271186440678,
|
| 51 |
+
"professional_medicine": 0.6470588235294118,
|
| 52 |
+
"professional_psychology": 0.6176470588235294,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6693877551020408,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.83,
|
| 57 |
+
"virology": 0.5060240963855421,
|
| 58 |
+
"world_religions": 0.7777777777777778
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-5000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.5555555555555556,
|
| 4 |
+
"astronomy": 0.5986842105263158,
|
| 5 |
+
"business_ethics": 0.58,
|
| 6 |
+
"clinical_knowledge": 0.6830188679245283,
|
| 7 |
+
"college_biology": 0.7152777777777778,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.47,
|
| 10 |
+
"college_mathematics": 0.31,
|
| 11 |
+
"college_medicine": 0.5838150289017341,
|
| 12 |
+
"college_physics": 0.3627450980392157,
|
| 13 |
+
"computer_security": 0.69,
|
| 14 |
+
"conceptual_physics": 0.5106382978723404,
|
| 15 |
+
"econometrics": 0.4649122807017544,
|
| 16 |
+
"electrical_engineering": 0.6206896551724138,
|
| 17 |
+
"elementary_mathematics": 0.41534391534391535,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.38,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.4729064039408867,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7424242424242424,
|
| 25 |
+
"high_school_government_and_politics": 0.844559585492228,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3296296296296296,
|
| 28 |
+
"high_school_microeconomics": 0.6134453781512605,
|
| 29 |
+
"high_school_physics": 0.304635761589404,
|
| 30 |
+
"high_school_psychology": 0.8091743119266055,
|
| 31 |
+
"high_school_statistics": 0.44907407407407407,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7637130801687764,
|
| 34 |
+
"human_aging": 0.6457399103139013,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.71900826446281,
|
| 37 |
+
"jurisprudence": 0.7592592592592593,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5357142857142857,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.811965811965812,
|
| 42 |
+
"medical_genetics": 0.72,
|
| 43 |
+
"miscellaneous": 0.7611749680715197,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.26256983240223464,
|
| 46 |
+
"nutrition": 0.6633986928104575,
|
| 47 |
+
"philosophy": 0.6463022508038585,
|
| 48 |
+
"prehistory": 0.691358024691358,
|
| 49 |
+
"professional_accounting": 0.44680851063829785,
|
| 50 |
+
"professional_law": 0.45632333767926986,
|
| 51 |
+
"professional_medicine": 0.6544117647058824,
|
| 52 |
+
"professional_psychology": 0.619281045751634,
|
| 53 |
+
"public_relations": 0.6818181818181818,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8308457711442786,
|
| 56 |
+
"us_foreign_policy": 0.84,
|
| 57 |
+
"virology": 0.5120481927710844,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-6000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.5481481481481482,
|
| 4 |
+
"astronomy": 0.5986842105263158,
|
| 5 |
+
"business_ethics": 0.57,
|
| 6 |
+
"clinical_knowledge": 0.690566037735849,
|
| 7 |
+
"college_biology": 0.7013888888888888,
|
| 8 |
+
"college_chemistry": 0.43,
|
| 9 |
+
"college_computer_science": 0.48,
|
| 10 |
+
"college_mathematics": 0.32,
|
| 11 |
+
"college_medicine": 0.5838150289017341,
|
| 12 |
+
"college_physics": 0.38235294117647056,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5191489361702127,
|
| 15 |
+
"econometrics": 0.49122807017543857,
|
| 16 |
+
"electrical_engineering": 0.6206896551724138,
|
| 17 |
+
"elementary_mathematics": 0.40476190476190477,
|
| 18 |
+
"formal_logic": 0.4365079365079365,
|
| 19 |
+
"global_facts": 0.37,
|
| 20 |
+
"high_school_biology": 0.7483870967741936,
|
| 21 |
+
"high_school_chemistry": 0.46798029556650245,
|
| 22 |
+
"high_school_computer_science": 0.64,
|
| 23 |
+
"high_school_european_history": 0.7454545454545455,
|
| 24 |
+
"high_school_geography": 0.7323232323232324,
|
| 25 |
+
"high_school_government_and_politics": 0.844559585492228,
|
| 26 |
+
"high_school_macroeconomics": 0.5512820512820513,
|
| 27 |
+
"high_school_mathematics": 0.3037037037037037,
|
| 28 |
+
"high_school_microeconomics": 0.6092436974789915,
|
| 29 |
+
"high_school_physics": 0.31125827814569534,
|
| 30 |
+
"high_school_psychology": 0.8091743119266055,
|
| 31 |
+
"high_school_statistics": 0.44907407407407407,
|
| 32 |
+
"high_school_us_history": 0.803921568627451,
|
| 33 |
+
"high_school_world_history": 0.759493670886076,
|
| 34 |
+
"human_aging": 0.6457399103139013,
|
| 35 |
+
"human_sexuality": 0.7251908396946565,
|
| 36 |
+
"international_law": 0.71900826446281,
|
| 37 |
+
"jurisprudence": 0.75,
|
| 38 |
+
"logical_fallacies": 0.7484662576687117,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7378640776699029,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.75,
|
| 43 |
+
"miscellaneous": 0.756066411238825,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.2659217877094972,
|
| 46 |
+
"nutrition": 0.6666666666666666,
|
| 47 |
+
"philosophy": 0.639871382636656,
|
| 48 |
+
"prehistory": 0.6975308641975309,
|
| 49 |
+
"professional_accounting": 0.44680851063829785,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6507352941176471,
|
| 52 |
+
"professional_psychology": 0.6225490196078431,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6816326530612244,
|
| 55 |
+
"sociology": 0.8258706467661692,
|
| 56 |
+
"us_foreign_policy": 0.84,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-7000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.5481481481481482,
|
| 4 |
+
"astronomy": 0.5921052631578947,
|
| 5 |
+
"business_ethics": 0.57,
|
| 6 |
+
"clinical_knowledge": 0.6867924528301886,
|
| 7 |
+
"college_biology": 0.7152777777777778,
|
| 8 |
+
"college_chemistry": 0.45,
|
| 9 |
+
"college_computer_science": 0.46,
|
| 10 |
+
"college_mathematics": 0.28,
|
| 11 |
+
"college_medicine": 0.6011560693641619,
|
| 12 |
+
"college_physics": 0.35294117647058826,
|
| 13 |
+
"computer_security": 0.68,
|
| 14 |
+
"conceptual_physics": 0.5234042553191489,
|
| 15 |
+
"econometrics": 0.4824561403508772,
|
| 16 |
+
"electrical_engineering": 0.593103448275862,
|
| 17 |
+
"elementary_mathematics": 0.41005291005291006,
|
| 18 |
+
"formal_logic": 0.42857142857142855,
|
| 19 |
+
"global_facts": 0.36,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.46798029556650245,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7393939393939394,
|
| 24 |
+
"high_school_geography": 0.7373737373737373,
|
| 25 |
+
"high_school_government_and_politics": 0.844559585492228,
|
| 26 |
+
"high_school_macroeconomics": 0.5538461538461539,
|
| 27 |
+
"high_school_mathematics": 0.3,
|
| 28 |
+
"high_school_microeconomics": 0.6050420168067226,
|
| 29 |
+
"high_school_physics": 0.31125827814569534,
|
| 30 |
+
"high_school_psychology": 0.8091743119266055,
|
| 31 |
+
"high_school_statistics": 0.4398148148148148,
|
| 32 |
+
"high_school_us_history": 0.7892156862745098,
|
| 33 |
+
"high_school_world_history": 0.7637130801687764,
|
| 34 |
+
"human_aging": 0.6457399103139013,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.7592592592592593,
|
| 38 |
+
"logical_fallacies": 0.7423312883435583,
|
| 39 |
+
"machine_learning": 0.5446428571428571,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.76,
|
| 43 |
+
"miscellaneous": 0.7547892720306514,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.2681564245810056,
|
| 46 |
+
"nutrition": 0.673202614379085,
|
| 47 |
+
"philosophy": 0.6430868167202572,
|
| 48 |
+
"prehistory": 0.7006172839506173,
|
| 49 |
+
"professional_accounting": 0.45390070921985815,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6470588235294118,
|
| 52 |
+
"professional_psychology": 0.6274509803921569,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.83,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8128654970760234
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-8000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.5481481481481482,
|
| 4 |
+
"astronomy": 0.5855263157894737,
|
| 5 |
+
"business_ethics": 0.56,
|
| 6 |
+
"clinical_knowledge": 0.6792452830188679,
|
| 7 |
+
"college_biology": 0.7152777777777778,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.46,
|
| 10 |
+
"college_mathematics": 0.29,
|
| 11 |
+
"college_medicine": 0.5838150289017341,
|
| 12 |
+
"college_physics": 0.35294117647058826,
|
| 13 |
+
"computer_security": 0.72,
|
| 14 |
+
"conceptual_physics": 0.5319148936170213,
|
| 15 |
+
"econometrics": 0.4824561403508772,
|
| 16 |
+
"electrical_engineering": 0.5655172413793104,
|
| 17 |
+
"elementary_mathematics": 0.4074074074074074,
|
| 18 |
+
"formal_logic": 0.4365079365079365,
|
| 19 |
+
"global_facts": 0.37,
|
| 20 |
+
"high_school_biology": 0.7548387096774194,
|
| 21 |
+
"high_school_chemistry": 0.4729064039408867,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7373737373737373,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.5564102564102564,
|
| 27 |
+
"high_school_mathematics": 0.3037037037037037,
|
| 28 |
+
"high_school_microeconomics": 0.6008403361344538,
|
| 29 |
+
"high_school_physics": 0.304635761589404,
|
| 30 |
+
"high_school_psychology": 0.8073394495412844,
|
| 31 |
+
"high_school_statistics": 0.4351851851851852,
|
| 32 |
+
"high_school_us_history": 0.7941176470588235,
|
| 33 |
+
"high_school_world_history": 0.7679324894514767,
|
| 34 |
+
"human_aging": 0.6502242152466368,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.7592592592592593,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5267857142857143,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.8205128205128205,
|
| 42 |
+
"medical_genetics": 0.76,
|
| 43 |
+
"miscellaneous": 0.7535121328224776,
|
| 44 |
+
"moral_disputes": 0.6734104046242775,
|
| 45 |
+
"moral_scenarios": 0.2670391061452514,
|
| 46 |
+
"nutrition": 0.6764705882352942,
|
| 47 |
+
"philosophy": 0.6527331189710611,
|
| 48 |
+
"prehistory": 0.7037037037037037,
|
| 49 |
+
"professional_accounting": 0.45390070921985815,
|
| 50 |
+
"professional_law": 0.455019556714472,
|
| 51 |
+
"professional_medicine": 0.6470588235294118,
|
| 52 |
+
"professional_psychology": 0.6241830065359477,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8208955223880597,
|
| 56 |
+
"us_foreign_policy": 0.83,
|
| 57 |
+
"virology": 0.5180722891566265,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|
checkpoint_results/mmlu_zero_shot_evaluation_results_checkpoint-9000.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"abstract_algebra": 0.29,
|
| 3 |
+
"anatomy": 0.562962962962963,
|
| 4 |
+
"astronomy": 0.5855263157894737,
|
| 5 |
+
"business_ethics": 0.56,
|
| 6 |
+
"clinical_knowledge": 0.6716981132075471,
|
| 7 |
+
"college_biology": 0.7083333333333334,
|
| 8 |
+
"college_chemistry": 0.44,
|
| 9 |
+
"college_computer_science": 0.46,
|
| 10 |
+
"college_mathematics": 0.27,
|
| 11 |
+
"college_medicine": 0.5953757225433526,
|
| 12 |
+
"college_physics": 0.35294117647058826,
|
| 13 |
+
"computer_security": 0.71,
|
| 14 |
+
"conceptual_physics": 0.5148936170212766,
|
| 15 |
+
"econometrics": 0.4824561403508772,
|
| 16 |
+
"electrical_engineering": 0.5724137931034483,
|
| 17 |
+
"elementary_mathematics": 0.3994708994708995,
|
| 18 |
+
"formal_logic": 0.42063492063492064,
|
| 19 |
+
"global_facts": 0.39,
|
| 20 |
+
"high_school_biology": 0.7580645161290323,
|
| 21 |
+
"high_school_chemistry": 0.47783251231527096,
|
| 22 |
+
"high_school_computer_science": 0.63,
|
| 23 |
+
"high_school_european_history": 0.7333333333333333,
|
| 24 |
+
"high_school_geography": 0.7323232323232324,
|
| 25 |
+
"high_school_government_and_politics": 0.8497409326424871,
|
| 26 |
+
"high_school_macroeconomics": 0.558974358974359,
|
| 27 |
+
"high_school_mathematics": 0.3037037037037037,
|
| 28 |
+
"high_school_microeconomics": 0.5966386554621849,
|
| 29 |
+
"high_school_physics": 0.2980132450331126,
|
| 30 |
+
"high_school_psychology": 0.8091743119266055,
|
| 31 |
+
"high_school_statistics": 0.4351851851851852,
|
| 32 |
+
"high_school_us_history": 0.7990196078431373,
|
| 33 |
+
"high_school_world_history": 0.7637130801687764,
|
| 34 |
+
"human_aging": 0.6502242152466368,
|
| 35 |
+
"human_sexuality": 0.732824427480916,
|
| 36 |
+
"international_law": 0.7107438016528925,
|
| 37 |
+
"jurisprudence": 0.7592592592592593,
|
| 38 |
+
"logical_fallacies": 0.7361963190184049,
|
| 39 |
+
"machine_learning": 0.5446428571428571,
|
| 40 |
+
"management": 0.7475728155339806,
|
| 41 |
+
"marketing": 0.811965811965812,
|
| 42 |
+
"medical_genetics": 0.75,
|
| 43 |
+
"miscellaneous": 0.7547892720306514,
|
| 44 |
+
"moral_disputes": 0.6791907514450867,
|
| 45 |
+
"moral_scenarios": 0.2670391061452514,
|
| 46 |
+
"nutrition": 0.6699346405228758,
|
| 47 |
+
"philosophy": 0.6495176848874598,
|
| 48 |
+
"prehistory": 0.7006172839506173,
|
| 49 |
+
"professional_accounting": 0.450354609929078,
|
| 50 |
+
"professional_law": 0.45697522816166886,
|
| 51 |
+
"professional_medicine": 0.6433823529411765,
|
| 52 |
+
"professional_psychology": 0.6241830065359477,
|
| 53 |
+
"public_relations": 0.6727272727272727,
|
| 54 |
+
"security_studies": 0.6775510204081633,
|
| 55 |
+
"sociology": 0.8159203980099502,
|
| 56 |
+
"us_foreign_policy": 0.84,
|
| 57 |
+
"virology": 0.5240963855421686,
|
| 58 |
+
"world_religions": 0.8070175438596491
|
| 59 |
+
}
|