Update README.md
Browse files
README.md
CHANGED
|
@@ -62,30 +62,29 @@ Average: 75.9% without mmlu
|
|
| 62 |
| | |mc2 |77.90|± | 1.37|
|
| 63 |
|
| 64 |
### BigBench Reasoning Test
|
| 65 |
-
|
| 66 |
-
| Task | Version | Metric
|
| 67 |
-
|
| 68 |
-
| bigbench_causal_judgement | 0| multiple_choice_grade
|
| 69 |
-
| bigbench_date_understanding | 0| multiple_choice_grade
|
| 70 |
-
| bigbench_disambiguation_qa | 0| multiple_choice_grade
|
| 71 |
-
| bigbench_geometric_shapes | 0| multiple_choice_grade
|
| 72 |
-
| ... | | exact_str_match |
|
| 73 |
-
| bigbench_geometric_shapes | 0| exact_str_match |
|
| 74 |
-
| bigbench_logical_deduction_five_objects | 0| multiple_choice_grade
|
| 75 |
-
| bigbench_logical_deduction_seven_objects | 0| multiple_choice_grade
|
| 76 |
-
| bigbench_logical_deduction_three_objects | 0| multiple_choice_grade
|
| 77 |
-
| bigbench_movie_recommendation | 0| multiple_choice_grade
|
| 78 |
-
| bigbench_navigate | 0| multiple_choice_grade
|
| 79 |
-
| bigbench_reasoning_about_colored_objects | 0| multiple_choice_grade
|
| 80 |
-
| bigbench_ruin_names | 0| multiple_choice_grade
|
| 81 |
-
| bigbench_salient_translation_error_detection | 0| multiple_choice_grade
|
| 82 |
-
| bigbench_snarks | 0| multiple_choice_grade
|
| 83 |
-
| bigbench_sports_understanding | 0| multiple_choice_grade
|
| 84 |
-
| bigbench_temporal_sequences | 0| multiple_choice_grade
|
| 85 |
-
| bigbench_tracking_shuffled_objects_five_objects| 0| multiple_choice_grade
|
| 86 |
-
| bigbench_tracking_shuffled_objects_seven_objects| 0| multiple_choice_grade
|
| 87 |
-
| bigbench_tracking_shuffled_objects_three_objects| 0| multiple_choice_grade
|
| 88 |
-
```
|
| 89 |
Average: 49.08%
|
| 90 |
|
| 91 |
|
|
|
|
| 62 |
| | |mc2 |77.90|± | 1.37|
|
| 63 |
|
| 64 |
### BigBench Reasoning Test
|
| 65 |
+
|
| 66 |
+
| Task | Version | Metric | Value | | Stderr|
|
| 67 |
+
|------------------------------------------------|---------|------------------------|-----------|---|-------|
|
| 68 |
+
| bigbench_causal_judgement | 0| multiple_choice_grade | 60.00 | ± | 3.56 |
|
| 69 |
+
| bigbench_date_understanding | 0| multiple_choice_grade | 62.06 | ± | 2.53 |
|
| 70 |
+
| bigbench_disambiguation_qa | 0| multiple_choice_grade | 54.26 | ± | 3.11 |
|
| 71 |
+
| bigbench_geometric_shapes | 0| multiple_choice_grade | 23.96 | ± | 2.26 |
|
| 72 |
+
| ... | | exact_str_match | | | |
|
| 73 |
+
| bigbench_geometric_shapes | 0| exact_str_match | 0.00 | ± | 0.00 |
|
| 74 |
+
| bigbench_logical_deduction_five_objects | 0| multiple_choice_grade | 32.80 | ± | 2.10 |
|
| 75 |
+
| bigbench_logical_deduction_seven_objects | 0| multiple_choice_grade | 23.86 | ± | 1.61 |
|
| 76 |
+
| bigbench_logical_deduction_three_objects | 0| multiple_choice_grade | 59.33 | ± | 2.84 |
|
| 77 |
+
| bigbench_movie_recommendation | 0| multiple_choice_grade | 58.00 | ± | 2.21 |
|
| 78 |
+
| bigbench_navigate | 0| multiple_choice_grade | 56.00 | ± | 1.57 |
|
| 79 |
+
| bigbench_reasoning_about_colored_objects | 0| multiple_choice_grade | 69.20 | ± | 1.03 |
|
| 80 |
+
| bigbench_ruin_names | 0| multiple_choice_grade | 55.36 | ± | 2.35 |
|
| 81 |
+
| bigbench_salient_translation_error_detection | 0| multiple_choice_grade | 41.48 | ± | 1.56 |
|
| 82 |
+
| bigbench_snarks | 0| multiple_choice_grade | 73.48 | ± | 3.29 |
|
| 83 |
+
| bigbench_sports_understanding | 0| multiple_choice_grade | 76.06 | ± | 1.36 |
|
| 84 |
+
| bigbench_temporal_sequences | 0| multiple_choice_grade | 55.50 | ± | 1.57 |
|
| 85 |
+
| bigbench_tracking_shuffled_objects_five_objects| 0| multiple_choice_grade | 23.28 | ± | 1.20 |
|
| 86 |
+
| bigbench_tracking_shuffled_objects_seven_objects| 0| multiple_choice_grade | 19.37 | ± | 0.94 |
|
| 87 |
+
| bigbench_tracking_shuffled_objects_three_objects| 0| multiple_choice_grade | 59.33 | ± | 2.84 |
|
|
|
|
| 88 |
Average: 49.08%
|
| 89 |
|
| 90 |
|