Update README.md
Browse files
README.md
CHANGED
|
@@ -81,6 +81,81 @@ Zero-shot AGIEval
|
|
| 81 |
| - agieval_sat_math | 1|none |None |acc |0.3091|± |0.0312|
|
| 82 |
| | |none |None |acc_norm|0.2364|± |0.0287|
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
## How to Use
|
| 85 |
```python
|
| 86 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
| 81 |
| - agieval_sat_math | 1|none |None |acc |0.3091|± |0.0312|
|
| 82 |
| | |none |None |acc_norm|0.2364|± |0.0287|
|
| 83 |
|
| 84 |
+
5-shot CoT MMLU
|
| 85 |
+
|
| 86 |
+
| Tasks |Version| Filter |n-shot| Metric |Value | |Stderr|
|
| 87 |
+
|-------------------------------------------------------------|-------|----------|-----:|-----------|-----:|---|-----:|
|
| 88 |
+
|mmlu_flan_cot_fewshot |N/A |get-answer| 0|exact_match|0.5924|± |0.0118|
|
| 89 |
+
| - mmlu_flan_cot_fewshot_humanities |N/A |get-answer| 0|exact_match|0.5077|± |0.0206|
|
| 90 |
+
| - mmlu_flan_cot_fewshot_formal_logic | 0|get-answer| 0|exact_match|0.2143|± |0.1138|
|
| 91 |
+
| - mmlu_flan_cot_fewshot_high_school_european_history | 0|get-answer| 0|exact_match|0.6111|± |0.1182|
|
| 92 |
+
| - mmlu_flan_cot_fewshot_high_school_us_history | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
|
| 93 |
+
| - mmlu_flan_cot_fewshot_high_school_world_history | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
|
| 94 |
+
| - mmlu_flan_cot_fewshot_international_law | 0|get-answer| 0|exact_match|0.9231|± |0.0769|
|
| 95 |
+
| - mmlu_flan_cot_fewshot_jurisprudence | 0|get-answer| 0|exact_match|0.3636|± |0.1521|
|
| 96 |
+
| - mmlu_flan_cot_fewshot_logical_fallacies | 0|get-answer| 0|exact_match|0.7222|± |0.1086|
|
| 97 |
+
| - mmlu_flan_cot_fewshot_moral_disputes | 0|get-answer| 0|exact_match|0.5526|± |0.0817|
|
| 98 |
+
| - mmlu_flan_cot_fewshot_moral_scenarios | 0|get-answer| 0|exact_match|0.3900|± |0.0490|
|
| 99 |
+
| - mmlu_flan_cot_fewshot_philosophy | 0|get-answer| 0|exact_match|0.7647|± |0.0738|
|
| 100 |
+
| - mmlu_flan_cot_fewshot_prehistory | 0|get-answer| 0|exact_match|0.7143|± |0.0775|
|
| 101 |
+
| - mmlu_flan_cot_fewshot_professional_law | 0|get-answer| 0|exact_match|0.3471|± |0.0366|
|
| 102 |
+
| - mmlu_flan_cot_fewshot_world_religions | 0|get-answer| 0|exact_match|0.8947|± |0.0723|
|
| 103 |
+
| - mmlu_flan_cot_fewshot_other |N/A |get-answer| 0|exact_match|0.6921|± |0.0240|
|
| 104 |
+
| - mmlu_flan_cot_fewshot_business_ethics | 0|get-answer| 0|exact_match|0.9091|± |0.0909|
|
| 105 |
+
| - mmlu_flan_cot_fewshot_clinical_knowledge | 0|get-answer| 0|exact_match|0.5517|± |0.0940|
|
| 106 |
+
| - mmlu_flan_cot_fewshot_college_medicine | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
|
| 107 |
+
| - mmlu_flan_cot_fewshot_global_facts | 0|get-answer| 0|exact_match|0.6000|± |0.1633|
|
| 108 |
+
| - mmlu_flan_cot_fewshot_human_aging | 0|get-answer| 0|exact_match|0.6522|± |0.1015|
|
| 109 |
+
| - mmlu_flan_cot_fewshot_management | 0|get-answer| 0|exact_match|0.9091|± |0.0909|
|
| 110 |
+
| - mmlu_flan_cot_fewshot_marketing | 0|get-answer| 0|exact_match|0.8400|± |0.0748|
|
| 111 |
+
| - mmlu_flan_cot_fewshot_medical_genetics | 0|get-answer| 0|exact_match|1.0000|± |0.0000|
|
| 112 |
+
| - mmlu_flan_cot_fewshot_miscellaneous | 0|get-answer| 0|exact_match|0.7791|± |0.0450|
|
| 113 |
+
| - mmlu_flan_cot_fewshot_nutrition | 0|get-answer| 0|exact_match|0.6667|± |0.0833|
|
| 114 |
+
| - mmlu_flan_cot_fewshot_professional_accounting | 0|get-answer| 0|exact_match|0.4194|± |0.0901|
|
| 115 |
+
| - mmlu_flan_cot_fewshot_professional_medicine | 0|get-answer| 0|exact_match|0.6774|± |0.0853|
|
| 116 |
+
| - mmlu_flan_cot_fewshot_virology | 0|get-answer| 0|exact_match|0.3889|± |0.1182|
|
| 117 |
+
| - mmlu_flan_cot_fewshot_social_sciences |N/A |get-answer| 0|exact_match|0.6973|± |0.0239|
|
| 118 |
+
| - mmlu_flan_cot_fewshot_econometrics | 0|get-answer| 0|exact_match|0.3333|± |0.1421|
|
| 119 |
+
| - mmlu_flan_cot_fewshot_high_school_geography | 0|get-answer| 0|exact_match|0.9091|± |0.0627|
|
| 120 |
+
| - mmlu_flan_cot_fewshot_high_school_government_and_politics| 0|get-answer| 0|exact_match|0.8095|± |0.0878|
|
| 121 |
+
| - mmlu_flan_cot_fewshot_high_school_macroeconomics | 0|get-answer| 0|exact_match|0.6279|± |0.0746|
|
| 122 |
+
| - mmlu_flan_cot_fewshot_high_school_microeconomics | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
|
| 123 |
+
| - mmlu_flan_cot_fewshot_high_school_psychology | 0|get-answer| 0|exact_match|0.9167|± |0.0360|
|
| 124 |
+
| - mmlu_flan_cot_fewshot_human_sexuality | 0|get-answer| 0|exact_match|0.5000|± |0.1508|
|
| 125 |
+
| - mmlu_flan_cot_fewshot_professional_psychology | 0|get-answer| 0|exact_match|0.6667|± |0.0572|
|
| 126 |
+
| - mmlu_flan_cot_fewshot_public_relations | 0|get-answer| 0|exact_match|0.5833|± |0.1486|
|
| 127 |
+
| - mmlu_flan_cot_fewshot_security_studies | 0|get-answer| 0|exact_match|0.4444|± |0.0975|
|
| 128 |
+
| - mmlu_flan_cot_fewshot_sociology | 0|get-answer| 0|exact_match|0.7727|± |0.0914|
|
| 129 |
+
| - mmlu_flan_cot_fewshot_us_foreign_policy | 0|get-answer| 0|exact_match|0.7273|± |0.1408|
|
| 130 |
+
| - mmlu_flan_cot_fewshot_stem |N/A |get-answer| 0|exact_match|0.5164|± |0.0265|
|
| 131 |
+
| - mmlu_flan_cot_fewshot_abstract_algebra | 0|get-answer| 0|exact_match|0.4545|± |0.1575|
|
| 132 |
+
| - mmlu_flan_cot_fewshot_anatomy | 0|get-answer| 0|exact_match|0.3571|± |0.1329|
|
| 133 |
+
| - mmlu_flan_cot_fewshot_astronomy | 0|get-answer| 0|exact_match|0.5000|± |0.1291|
|
| 134 |
+
| - mmlu_flan_cot_fewshot_college_biology | 0|get-answer| 0|exact_match|0.5625|± |0.1281|
|
| 135 |
+
| - mmlu_flan_cot_fewshot_college_chemistry | 0|get-answer| 0|exact_match|0.3750|± |0.1830|
|
| 136 |
+
| - mmlu_flan_cot_fewshot_college_computer_science | 0|get-answer| 0|exact_match|0.2727|± |0.1408|
|
| 137 |
+
| - mmlu_flan_cot_fewshot_college_mathematics | 0|get-answer| 0|exact_match|0.2727|± |0.1408|
|
| 138 |
+
| - mmlu_flan_cot_fewshot_college_physics | 0|get-answer| 0|exact_match|0.4545|± |0.1575|
|
| 139 |
+
| - mmlu_flan_cot_fewshot_computer_security | 0|get-answer| 0|exact_match|0.7273|± |0.1408|
|
| 140 |
+
| - mmlu_flan_cot_fewshot_conceptual_physics | 0|get-answer| 0|exact_match|0.6154|± |0.0973|
|
| 141 |
+
| - mmlu_flan_cot_fewshot_electrical_engineering | 0|get-answer| 0|exact_match|0.6875|± |0.1197|
|
| 142 |
+
| - mmlu_flan_cot_fewshot_elementary_mathematics | 0|get-answer| 0|exact_match|0.7317|± |0.0701|
|
| 143 |
+
| - mmlu_flan_cot_fewshot_high_school_biology | 0|get-answer| 0|exact_match|0.7188|± |0.0808|
|
| 144 |
+
| - mmlu_flan_cot_fewshot_high_school_chemistry | 0|get-answer| 0|exact_match|0.3636|± |0.1050|
|
| 145 |
+
| - mmlu_flan_cot_fewshot_high_school_computer_science | 0|get-answer| 0|exact_match|0.6667|± |0.1667|
|
| 146 |
+
| - mmlu_flan_cot_fewshot_high_school_mathematics | 0|get-answer| 0|exact_match|0.4138|± |0.0931|
|
| 147 |
+
| - mmlu_flan_cot_fewshot_high_school_physics | 0|get-answer| 0|exact_match|0.2353|± |0.1060|
|
| 148 |
+
| - mmlu_flan_cot_fewshot_high_school_statistics | 0|get-answer| 0|exact_match|0.4348|± |0.1057|
|
| 149 |
+
| - mmlu_flan_cot_fewshot_machine_learning | 0|get-answer| 0|exact_match|0.3636|± |0.1521|
|
| 150 |
+
|
| 151 |
+
| Groups |Version| Filter |n-shot| Metric |Value | |Stderr|
|
| 152 |
+
|----------------------------------------|-------|----------|-----:|-----------|-----:|---|-----:|
|
| 153 |
+
|mmlu_flan_cot_fewshot |N/A |get-answer| 0|exact_match|0.5924|± |0.0118|
|
| 154 |
+
| - mmlu_flan_cot_fewshot_humanities |N/A |get-answer| 0|exact_match|0.5077|± |0.0206|
|
| 155 |
+
| - mmlu_flan_cot_fewshot_other |N/A |get-answer| 0|exact_match|0.6921|± |0.0240|
|
| 156 |
+
| - mmlu_flan_cot_fewshot_social_sciences|N/A |get-answer| 0|exact_match|0.6973|± |0.0239|
|
| 157 |
+
| - mmlu_flan_cot_fewshot_stem |N/A |get-answer| 0|exact_match|0.5164|± |0.0265|
|
| 158 |
+
|
| 159 |
## How to Use
|
| 160 |
```python
|
| 161 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|