Update README.md
Browse files
README.md
CHANGED
|
@@ -21,81 +21,6 @@ datasets:
|
|
| 21 |
|[minghaowu/phi-2-OpenHermes-2.5](https://huggingface.co/minghaowu/phi-2-OpenHermes-2.5)| 27.95| 67.55| 48.07| 36.17| 44.94|
|
| 22 |
|[phi-2](https://huggingface.co/microsoft/phi-2)| 27.96| 70.84| 44.46| 35.17| 44.61|
|
| 23 |
|
| 24 |
-
### AGIEval
|
| 25 |
-
| Task |Version| Metric |Value| |Stderr|
|
| 26 |
-
|------------------------------|------:|--------|----:|---|-----:|
|
| 27 |
-
|agieval_aqua_rat | 0|acc |18.90|± | 2.46|
|
| 28 |
-
| | |acc_norm|19.69|± | 2.50|
|
| 29 |
-
|agieval_logiqa_en | 0|acc |28.73|± | 1.77|
|
| 30 |
-
| | |acc_norm|31.80|± | 1.83|
|
| 31 |
-
|agieval_lsat_ar | 0|acc |19.13|± | 2.60|
|
| 32 |
-
| | |acc_norm|19.57|± | 2.62|
|
| 33 |
-
|agieval_lsat_lr | 0|acc |30.20|± | 2.03|
|
| 34 |
-
| | |acc_norm|28.04|± | 1.99|
|
| 35 |
-
|agieval_lsat_rc | 0|acc |37.92|± | 2.96|
|
| 36 |
-
| | |acc_norm|32.71|± | 2.87|
|
| 37 |
-
|agieval_sat_en | 0|acc |52.91|± | 3.49|
|
| 38 |
-
| | |acc_norm|47.57|± | 3.49|
|
| 39 |
-
|agieval_sat_en_without_passage| 0|acc |39.32|± | 3.41|
|
| 40 |
-
| | |acc_norm|36.41|± | 3.36|
|
| 41 |
-
|agieval_sat_math | 0|acc |30.00|± | 3.10|
|
| 42 |
-
| | |acc_norm|26.36|± | 2.98|
|
| 43 |
-
|
| 44 |
-
Average: 30.27%
|
| 45 |
-
|
| 46 |
-
### GPT4All
|
| 47 |
-
| Task |Version| Metric |Value| |Stderr|
|
| 48 |
-
|-------------|------:|--------|----:|---|-----:|
|
| 49 |
-
|arc_challenge| 0|acc |50.94|± | 1.46|
|
| 50 |
-
| | |acc_norm|53.24|± | 1.46|
|
| 51 |
-
|arc_easy | 0|acc |80.77|± | 0.81|
|
| 52 |
-
| | |acc_norm|78.70|± | 0.84|
|
| 53 |
-
|boolq | 1|acc |84.22|± | 0.64|
|
| 54 |
-
|hellaswag | 0|acc |55.94|± | 0.50|
|
| 55 |
-
| | |acc_norm|73.80|± | 0.44|
|
| 56 |
-
|openbookqa | 0|acc |38.80|± | 2.18|
|
| 57 |
-
| | |acc_norm|50.60|± | 2.24|
|
| 58 |
-
|piqa | 0|acc |79.11|± | 0.95|
|
| 59 |
-
| | |acc_norm|79.98|± | 0.93|
|
| 60 |
-
|winogrande | 0|acc |77.74|± | 1.17|
|
| 61 |
-
|
| 62 |
-
Average: 71.18%
|
| 63 |
-
|
| 64 |
-
### TruthfulQA
|
| 65 |
-
| Task |Version|Metric|Value| |Stderr|
|
| 66 |
-
|-------------|------:|------|----:|---|-----:|
|
| 67 |
-
|truthfulqa_mc| 1|mc1 |30.72|± | 1.62|
|
| 68 |
-
| | |mc2 |43.87|± | 1.52|
|
| 69 |
-
|
| 70 |
-
Average: 43.87%
|
| 71 |
-
|
| 72 |
-
### Bigbench
|
| 73 |
-
| Task |Version| Metric |Value| |Stderr|
|
| 74 |
-
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|
| 75 |
-
|bigbench_causal_judgement | 0|multiple_choice_grade|61.05|± | 3.55|
|
| 76 |
-
|bigbench_date_understanding | 0|multiple_choice_grade|59.35|± | 2.56|
|
| 77 |
-
|bigbench_disambiguation_qa | 0|multiple_choice_grade|40.31|± | 3.06|
|
| 78 |
-
|bigbench_geometric_shapes | 0|multiple_choice_grade|10.03|± | 1.59|
|
| 79 |
-
| | |exact_str_match | 6.69|± | 1.32|
|
| 80 |
-
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|25.40|± | 1.95|
|
| 81 |
-
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|16.71|± | 1.41|
|
| 82 |
-
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|41.67|± | 2.85|
|
| 83 |
-
|bigbench_movie_recommendation | 0|multiple_choice_grade|39.40|± | 2.19|
|
| 84 |
-
|bigbench_navigate | 0|multiple_choice_grade|50.00|± | 1.58|
|
| 85 |
-
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|56.10|± | 1.11|
|
| 86 |
-
|bigbench_ruin_names | 0|multiple_choice_grade|26.12|± | 2.08|
|
| 87 |
-
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|24.95|± | 1.37|
|
| 88 |
-
|bigbench_snarks | 0|multiple_choice_grade|58.56|± | 3.67|
|
| 89 |
-
|bigbench_sports_understanding | 0|multiple_choice_grade|50.00|± | 1.59|
|
| 90 |
-
|bigbench_temporal_sequences | 0|multiple_choice_grade|14.60|± | 1.12|
|
| 91 |
-
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|17.52|± | 1.08|
|
| 92 |
-
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|12.80|± | 0.80|
|
| 93 |
-
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|41.67|± | 2.85|
|
| 94 |
-
|
| 95 |
-
Average: 35.9%
|
| 96 |
-
|
| 97 |
-
Average score: 45.3%
|
| 98 |
-
|
| 99 |
## Inference
|
| 100 |
```python
|
| 101 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
| 21 |
|[minghaowu/phi-2-OpenHermes-2.5](https://huggingface.co/minghaowu/phi-2-OpenHermes-2.5)| 27.95| 67.55| 48.07| 36.17| 44.94|
|
| 22 |
|[phi-2](https://huggingface.co/microsoft/phi-2)| 27.96| 70.84| 44.46| 35.17| 44.61|
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
## Inference
|
| 25 |
```python
|
| 26 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|