Spaces:
Sleeping
Sleeping
Update tasks
Browse files- data/results.json +10 -10
- data/tasks.json +3 -3
data/results.json
CHANGED
|
@@ -105,43 +105,43 @@
|
|
| 105 |
},
|
| 106 |
"gpqa_diamond": {
|
| 107 |
"accuracy": 0.4318181818181818,
|
| 108 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
|
| 109 |
},
|
| 110 |
"winogrande": {
|
| 111 |
"accuracy": 0.8666140489344909,
|
| 112 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
|
| 113 |
},
|
| 114 |
"gsm8k": {
|
| 115 |
"accuracy": 0.9469294920394238,
|
| 116 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
|
| 117 |
},
|
| 118 |
"math": {
|
| 119 |
"accuracy": 0.6004,
|
| 120 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
|
| 121 |
},
|
| 122 |
"ifeval": {
|
| 123 |
"final_acc": 0.8604907201780166,
|
| 124 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
|
| 125 |
},
|
| 126 |
"arc_challenge": {
|
| 127 |
"accuracy": 0.9445392491467577,
|
| 128 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
|
| 129 |
},
|
| 130 |
"arc_easy": {
|
| 131 |
"accuracy": 0.9823232323232324,
|
| 132 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
|
| 133 |
},
|
| 134 |
"mmlu_pro": {
|
| 135 |
"accuracy": 0.6688829787234043,
|
| 136 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
|
| 137 |
},
|
| 138 |
"humaneval": {
|
| 139 |
"mean": 0.7865853658536586,
|
| 140 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
|
| 141 |
},
|
| 142 |
"mmlu": {
|
| 143 |
"accuracy": 0.8033755875231449,
|
| 144 |
-
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
|
| 145 |
},
|
| 146 |
"mmmu_multiple_choice": {
|
| 147 |
"accuracy": null,
|
|
|
|
| 105 |
},
|
| 106 |
"gpqa_diamond": {
|
| 107 |
"accuracy": 0.4318181818181818,
|
| 108 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
|
| 109 |
},
|
| 110 |
"winogrande": {
|
| 111 |
"accuracy": 0.8666140489344909,
|
| 112 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
|
| 113 |
},
|
| 114 |
"gsm8k": {
|
| 115 |
"accuracy": 0.9469294920394238,
|
| 116 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
|
| 117 |
},
|
| 118 |
"math": {
|
| 119 |
"accuracy": 0.6004,
|
| 120 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
|
| 121 |
},
|
| 122 |
"ifeval": {
|
| 123 |
"final_acc": 0.8604907201780166,
|
| 124 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
|
| 125 |
},
|
| 126 |
"arc_challenge": {
|
| 127 |
"accuracy": 0.9445392491467577,
|
| 128 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
|
| 129 |
},
|
| 130 |
"arc_easy": {
|
| 131 |
"accuracy": 0.9823232323232324,
|
| 132 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
|
| 133 |
},
|
| 134 |
"mmlu_pro": {
|
| 135 |
"accuracy": 0.6688829787234043,
|
| 136 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
|
| 137 |
},
|
| 138 |
"humaneval": {
|
| 139 |
"mean": 0.7865853658536586,
|
| 140 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
|
| 141 |
},
|
| 142 |
"mmlu": {
|
| 143 |
"accuracy": 0.8033755875231449,
|
| 144 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
|
| 145 |
},
|
| 146 |
"mmmu_multiple_choice": {
|
| 147 |
"accuracy": null,
|
data/tasks.json
CHANGED
|
@@ -2,14 +2,14 @@
|
|
| 2 |
"arc_easy": {
|
| 3 |
"benchmark": "arc_easy",
|
| 4 |
"metric": "accuracy",
|
| 5 |
-
"display_name": "ARC-
|
| 6 |
"type": "base",
|
| 7 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
| 8 |
},
|
| 9 |
"arc_challenge": {
|
| 10 |
"benchmark": "arc_challenge",
|
| 11 |
"metric": "accuracy",
|
| 12 |
-
"display_name": "ARC-
|
| 13 |
"type": "base",
|
| 14 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
| 15 |
},
|
|
@@ -79,7 +79,7 @@
|
|
| 79 |
"gpqa_diamond": {
|
| 80 |
"benchmark": "gpqa_diamond",
|
| 81 |
"metric": "accuracy",
|
| 82 |
-
"display_name": "GPQA-
|
| 83 |
"type": "base",
|
| 84 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
|
| 85 |
},
|
|
|
|
| 2 |
"arc_easy": {
|
| 3 |
"benchmark": "arc_easy",
|
| 4 |
"metric": "accuracy",
|
| 5 |
+
"display_name": "ARC-E",
|
| 6 |
"type": "base",
|
| 7 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
| 8 |
},
|
| 9 |
"arc_challenge": {
|
| 10 |
"benchmark": "arc_challenge",
|
| 11 |
"metric": "accuracy",
|
| 12 |
+
"display_name": "ARC-C",
|
| 13 |
"type": "base",
|
| 14 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
| 15 |
},
|
|
|
|
| 79 |
"gpqa_diamond": {
|
| 80 |
"benchmark": "gpqa_diamond",
|
| 81 |
"metric": "accuracy",
|
| 82 |
+
"display_name": "GPQA-D",
|
| 83 |
"type": "base",
|
| 84 |
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
|
| 85 |
},
|