Init commit with simple table
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +110 -0
- evals/arc-challenge/arc_ar_challenge_bloom-1b7.json +23 -0
- evals/arc-challenge/arc_ar_challenge_bloom-560.json +23 -0
- evals/arc-challenge/arc_ar_challenge_bloom-7b1.json +23 -0
- evals/arc-challenge/arc_ar_challenge_gpt2-large.json +23 -0
- evals/arc-challenge/arc_ar_challenge_gpt2-medium.json +23 -0
- evals/arc-challenge/arc_ar_challenge_gpt2.json +23 -0
- evals/arc-challenge/arc_ar_challenge_llama-7B.json +23 -0
- evals/arc-challenge/arc_bn_challenge_bloom-1b7.json +23 -0
- evals/arc-challenge/arc_bn_challenge_bloom-560.json +23 -0
- evals/arc-challenge/arc_bn_challenge_bloom-7b1.json +23 -0
- evals/arc-challenge/arc_bn_challenge_gpt2-large.json +23 -0
- evals/arc-challenge/arc_bn_challenge_gpt2-medium.json +23 -0
- evals/arc-challenge/arc_bn_challenge_gpt2.json +23 -0
- evals/arc-challenge/arc_bn_challenge_llama-7B.json +23 -0
- evals/arc-challenge/arc_ca_challenge_bloom-1b7.json +23 -0
- evals/arc-challenge/arc_ca_challenge_bloom-560.json +23 -0
- evals/arc-challenge/arc_ca_challenge_bloom-7b1.json +23 -0
- evals/arc-challenge/arc_ca_challenge_gpt2-large.json +23 -0
- evals/arc-challenge/arc_ca_challenge_gpt2-medium.json +23 -0
- evals/arc-challenge/arc_ca_challenge_gpt2.json +23 -0
- evals/arc-challenge/arc_ca_challenge_llama-7B.json +23 -0
- evals/arc-challenge/arc_da_challenge_bloom-1b7.json +23 -0
- evals/arc-challenge/arc_da_challenge_bloom-560.json +23 -0
- evals/arc-challenge/arc_da_challenge_bloom-7b1.json +23 -0
- evals/arc-challenge/arc_da_challenge_gpt2-large.json +23 -0
- evals/arc-challenge/arc_da_challenge_gpt2-medium.json +23 -0
- evals/arc-challenge/arc_da_challenge_gpt2.json +23 -0
- evals/arc-challenge/arc_da_challenge_llama-7B.json +23 -0
- evals/arc-challenge/arc_de_challenge_bloom-1b7.json +23 -0
- evals/arc-challenge/arc_de_challenge_bloom-560.json +23 -0
- evals/arc-challenge/arc_de_challenge_bloom-7b1.json +23 -0
- evals/arc-challenge/arc_de_challenge_gpt2-large.json +23 -0
- evals/arc-challenge/arc_de_challenge_gpt2-medium.json +23 -0
- evals/arc-challenge/arc_de_challenge_gpt2.json +23 -0
- evals/arc-challenge/arc_de_challenge_llama-7B.json +23 -0
- evals/arc-challenge/arc_es_challenge_bloom-1b7.json +23 -0
- evals/arc-challenge/arc_es_challenge_bloom-560.json +23 -0
- evals/arc-challenge/arc_es_challenge_bloom-7b1.json +23 -0
- evals/arc-challenge/arc_es_challenge_gpt2-large.json +23 -0
- evals/arc-challenge/arc_es_challenge_gpt2-medium.json +23 -0
- evals/arc-challenge/arc_es_challenge_gpt2.json +23 -0
- evals/arc-challenge/arc_es_challenge_llama-7B.json +23 -0
- evals/arc-challenge/arc_eu_challenge_bloom-1b7.json +23 -0
- evals/arc-challenge/arc_eu_challenge_bloom-560.json +23 -0
- evals/arc-challenge/arc_eu_challenge_bloom-7b1.json +23 -0
- evals/arc-challenge/arc_eu_challenge_gpt2-large.json +23 -0
- evals/arc-challenge/arc_eu_challenge_gpt2-medium.json +23 -0
- evals/arc-challenge/arc_eu_challenge_gpt2.json +23 -0
- evals/arc-challenge/arc_eu_challenge_llama-7B.json +23 -0
app.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import glob
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
import gradio as gr
|
| 6 |
+
|
| 7 |
+
import glob
|
| 8 |
+
|
| 9 |
+
# Canonical benchmark identifiers; they are the keys of every per-(model, language)
# score dict produced by collect_results().
ARC = "arc_challenge"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa-mc"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]

# Metric read from each benchmark's result entry, index-aligned with BENCHMARKS.
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]


def _parse_lang_task(lang_task):
    """Split a results key into ``(lang, task)``; return ``(None, None)`` if unrecognised.

    Recognised shapes are ``arc_<lang>_challenge``, ``hellaswag_<lang>``,
    ``mmlu_<lang>`` and ``truthfulqa_<lang>_mc``. The number of underscore-separated
    parts is checked explicitly so that keys without a language segment (for
    example plain ``arc_challenge``) are not mistaken for a language.
    """
    parts = lang_task.split('_')
    if len(parts) == 3:
        prefix, lang, suffix = parts
        if prefix == 'arc' and suffix == 'challenge':
            return lang, ARC
        if prefix == 'truthfulqa' and suffix == 'mc':
            return lang, TRUTHFULQA
    elif len(parts) == 2:
        prefix, lang = parts
        if prefix == 'hellaswag':
            return lang, HELLASWAG
        if prefix == 'mmlu':
            return lang, MMLU
    return None, None


def collect_results():
    """Scan ``evals/*/*.json`` and gather one score per (model, language, benchmark).

    Files lacking a ``results`` or ``config`` entry, a ``model_args`` string, or
    exactly one ``pretrained=...`` argument are skipped. The model name is the
    last ``/``-separated component of the ``pretrained`` value. Result keys that
    do not match a known benchmark pattern, or whose entry lacks the benchmark's
    metric, are ignored.

    Returns:
        A ``(performance_dict, pretrained_models)`` tuple. ``performance_dict``
        maps ``(model, lang)`` to ``{benchmark: score}`` where the score is the
        metric times 100, rounded to one decimal. ``pretrained_models`` is the
        set of model names seen in accepted files.
    """
    metric_for = dict(zip(BENCHMARKS, METRICS))
    performance_dict = defaultdict(dict)
    pretrained_models = set()
    for file in glob.glob('evals/*/*.json'):
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if 'results' not in data or 'config' not in data:
            continue
        results = data['results']
        config = data['config']
        if 'model_args' not in config:
            continue

        model_args = config['model_args'].split(',')
        pretrained = [x for x in model_args if x.startswith('pretrained=')]
        if len(pretrained) != 1:
            continue
        # Keep only the final path component so hub ids and local paths agree.
        pretrained = pretrained[0].split('=', 1)[1].split('/')[-1]
        pretrained_models.add(pretrained)

        for lang_task, perfs in results.items():
            # Parsed afresh for every key so an unrecognised key can never
            # inherit the language/task of the key before it.
            lang, task = _parse_lang_task(lang_task)
            if not (lang and task):
                continue
            metric = metric_for[task]
            if metric not in perfs:
                continue
            performance_dict[(pretrained, lang)][task] = round(perfs[metric] * 100, 1)
    return performance_dict, pretrained_models
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def get_leaderboard_df(performance_dict, pretrained_models):
    """Turn per-(model, language) scores into leaderboard table rows.

    Args:
        performance_dict: Mapping of ``(model, lang)`` to ``{benchmark: score}``.
        pretrained_models: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A list of rows ``[model, lang, average, arc, hellaswag, mmlu, truthfulqa]``.
        Only pairs with a score for all four benchmarks produce a row; the
        average is the mean of the four scores rounded to one decimal.
    """
    required = (ARC, HELLASWAG, MMLU, TRUTHFULQA)
    rows = []
    for (pretrained, lang), perfs in performance_dict.items():
        # Test for presence rather than multiplying the scores together: a
        # genuine 0.0 score must not be treated as "benchmark missing".
        if any(benchmark not in perfs for benchmark in required):
            continue
        scores = [perfs[benchmark] for benchmark in required]
        avg = round(sum(scores) / len(scores), 1)
        rows.append([pretrained, lang, avg, *scores])
    return rows
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Display headers for the leaderboard table, listed in column order.
MODEL_COL = "Model"
LANG_COL = "Language"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
# The HellaSwag and MMLU headers previously ended with an invisible U+FE0F
# variation selector (and MMLU with a doubled ")"); both are removed here.
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"

COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
# Dataframe column datatypes, index-aligned with COLS.
TYPES = ["str", "str", "number", "number", "number", "number", "number"]
|
| 88 |
+
|
| 89 |
+
# Read every evaluation file once at start-up and flatten the scores into rows.
args = collect_results()
leaderboard_df = get_leaderboard_df(*args)

# Page layout: title, introduction, a search box and the results table.
# NOTE(review): nesting below is reconstructed from syntax (the search box inside
# the Box, the table directly under the Blocks root) - confirm against the repo.
with gr.Blocks() as demo:
    gr.HTML('Open Multilingual Large Language Model Evaluation Leaderboard')
    gr.Markdown('INTRODUCTION TEXT', elem_classes="markdown-text")

    with gr.Box():
        # The search box is created here but no event handler is attached to it.
        search_bar = gr.Textbox(
            placeholder="Search models...",
            show_label=False,
            elem_id="search-bar",
        )

    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df,
        headers=COLS,
        datatype=TYPES,
        max_rows=5,
        elem_id="leaderboard-table",
    )

demo.launch()
|
evals/arc-challenge/arc_ar_challenge_bloom-1b7.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ar_challenge": {
|
| 4 |
+
"acc": 0.22818791946308725,
|
| 5 |
+
"acc_stderr": 0.02435139725761051,
|
| 6 |
+
"acc_norm": 0.2516778523489933,
|
| 7 |
+
"acc_norm_stderr": 0.025181904610615872
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ar_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-1b7",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ar_challenge_bloom-560.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ar_challenge": {
|
| 4 |
+
"acc": 0.2550335570469799,
|
| 5 |
+
"acc_stderr": 0.025292327380712708,
|
| 6 |
+
"acc_norm": 0.2550335570469799,
|
| 7 |
+
"acc_norm_stderr": 0.025292327380712708
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ar_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-560m",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ar_challenge_bloom-7b1.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ar_challenge": {
|
| 4 |
+
"acc": 0.28187919463087246,
|
| 5 |
+
"acc_stderr": 0.026106703750007426,
|
| 6 |
+
"acc_norm": 0.3087248322147651,
|
| 7 |
+
"acc_norm_stderr": 0.026806063072940547
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ar_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ar_challenge_gpt2-large.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ar_challenge": {
|
| 4 |
+
"acc": 0.20134228187919462,
|
| 5 |
+
"acc_stderr": 0.023268565767685306,
|
| 6 |
+
"acc_norm": 0.21476510067114093,
|
| 7 |
+
"acc_norm_stderr": 0.023828868848284352
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ar_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-large",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ar_challenge_gpt2-medium.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ar_challenge": {
|
| 4 |
+
"acc": 0.19463087248322147,
|
| 5 |
+
"acc_stderr": 0.022973392306598162,
|
| 6 |
+
"acc_norm": 0.21140939597315436,
|
| 7 |
+
"acc_norm_stderr": 0.02369243605357901
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ar_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-medium",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ar_challenge_gpt2.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ar_challenge": {
|
| 4 |
+
"acc": 0.20134228187919462,
|
| 5 |
+
"acc_stderr": 0.023268565767685313,
|
| 6 |
+
"acc_norm": 0.22483221476510068,
|
| 7 |
+
"acc_norm_stderr": 0.024224169829650755
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ar_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ar_challenge_llama-7B.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ar_challenge": {
|
| 4 |
+
"acc": 0.22483221476510068,
|
| 5 |
+
"acc_stderr": 0.02422416982965075,
|
| 6 |
+
"acc_norm": 0.24161073825503357,
|
| 7 |
+
"acc_norm_stderr": 0.024838535108028477
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ar_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_bn_challenge_bloom-1b7.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_bn_challenge": {
|
| 4 |
+
"acc": 0.20945945945945946,
|
| 5 |
+
"acc_stderr": 0.023691963473475724,
|
| 6 |
+
"acc_norm": 0.2533783783783784,
|
| 7 |
+
"acc_norm_stderr": 0.025323518629100008
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_bn_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-1b7",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_bn_challenge_bloom-560.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_bn_challenge": {
|
| 4 |
+
"acc": 0.22972972972972974,
|
| 5 |
+
"acc_stderr": 0.024491712953916975,
|
| 6 |
+
"acc_norm": 0.24662162162162163,
|
| 7 |
+
"acc_norm_stderr": 0.025096383517594287
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_bn_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-560m",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_bn_challenge_bloom-7b1.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_bn_challenge": {
|
| 4 |
+
"acc": 0.23986486486486486,
|
| 5 |
+
"acc_stderr": 0.02486094967084638,
|
| 6 |
+
"acc_norm": 0.28040540540540543,
|
| 7 |
+
"acc_norm_stderr": 0.026153277917823237
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_bn_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_bn_challenge_gpt2-large.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_bn_challenge": {
|
| 4 |
+
"acc": 0.2195945945945946,
|
| 5 |
+
"acc_stderr": 0.024102381106046785,
|
| 6 |
+
"acc_norm": 0.2668918918918919,
|
| 7 |
+
"acc_norm_stderr": 0.025753762926257924
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_bn_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-large",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_bn_challenge_gpt2-medium.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_bn_challenge": {
|
| 4 |
+
"acc": 0.20608108108108109,
|
| 5 |
+
"acc_stderr": 0.02355028295929425,
|
| 6 |
+
"acc_norm": 0.24662162162162163,
|
| 7 |
+
"acc_norm_stderr": 0.02509638351759427
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_bn_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-medium",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_bn_challenge_gpt2.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_bn_challenge": {
|
| 4 |
+
"acc": 0.22635135135135134,
|
| 5 |
+
"acc_stderr": 0.024364215012920555,
|
| 6 |
+
"acc_norm": 0.2668918918918919,
|
| 7 |
+
"acc_norm_stderr": 0.025753762926257917
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_bn_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_bn_challenge_llama-7B.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_bn_challenge": {
|
| 4 |
+
"acc": 0.22635135135135134,
|
| 5 |
+
"acc_stderr": 0.024364215012920565,
|
| 6 |
+
"acc_norm": 0.26013513513513514,
|
| 7 |
+
"acc_norm_stderr": 0.02554257639364025
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_bn_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ca_challenge_bloom-1b7.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ca_challenge": {
|
| 4 |
+
"acc": 0.2356902356902357,
|
| 5 |
+
"acc_stderr": 0.02466946003490763,
|
| 6 |
+
"acc_norm": 0.27946127946127947,
|
| 7 |
+
"acc_norm_stderr": 0.026082164400369843
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ca_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-1b7",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ca_challenge_bloom-560.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ca_challenge": {
|
| 4 |
+
"acc": 0.2053872053872054,
|
| 5 |
+
"acc_stderr": 0.02348110951859932,
|
| 6 |
+
"acc_norm": 0.23232323232323232,
|
| 7 |
+
"acc_norm_stderr": 0.02454650495612789
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ca_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-560m",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ca_challenge_bloom-7b1.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ca_challenge": {
|
| 4 |
+
"acc": 0.3164983164983165,
|
| 5 |
+
"acc_stderr": 0.02703395838420779,
|
| 6 |
+
"acc_norm": 0.3434343434343434,
|
| 7 |
+
"acc_norm_stderr": 0.0276003816062635
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ca_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ca_challenge_gpt2-large.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ca_challenge": {
|
| 4 |
+
"acc": 0.20875420875420875,
|
| 5 |
+
"acc_stderr": 0.02362258775627148,
|
| 6 |
+
"acc_norm": 0.22895622895622897,
|
| 7 |
+
"acc_norm_stderr": 0.02442136264227106
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ca_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-large",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ca_challenge_gpt2-medium.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ca_challenge": {
|
| 4 |
+
"acc": 0.20875420875420875,
|
| 5 |
+
"acc_stderr": 0.023622587756271473,
|
| 6 |
+
"acc_norm": 0.21212121212121213,
|
| 7 |
+
"acc_norm_stderr": 0.023761611918761673
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ca_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-medium",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ca_challenge_gpt2.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ca_challenge": {
|
| 4 |
+
"acc": 0.21885521885521886,
|
| 5 |
+
"acc_stderr": 0.024032467624412215,
|
| 6 |
+
"acc_norm": 0.21885521885521886,
|
| 7 |
+
"acc_norm_stderr": 0.02403246762441221
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ca_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_ca_challenge_llama-7B.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_ca_challenge": {
|
| 4 |
+
"acc": 0.29292929292929293,
|
| 5 |
+
"acc_stderr": 0.026452514969665927,
|
| 6 |
+
"acc_norm": 0.29292929292929293,
|
| 7 |
+
"acc_norm_stderr": 0.02645251496966592
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_ca_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_da_challenge_bloom-1b7.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_da_challenge": {
|
| 4 |
+
"acc": 0.2255892255892256,
|
| 5 |
+
"acc_stderr": 0.02429399929295737,
|
| 6 |
+
"acc_norm": 0.26262626262626265,
|
| 7 |
+
"acc_norm_stderr": 0.02557802773320011
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_da_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-1b7",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_da_challenge_bloom-560.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_da_challenge": {
|
| 4 |
+
"acc": 0.25925925925925924,
|
| 5 |
+
"acc_stderr": 0.025471492792791667,
|
| 6 |
+
"acc_norm": 0.24579124579124578,
|
| 7 |
+
"acc_norm_stderr": 0.025025521384235284
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_da_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-560m",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_da_challenge_bloom-7b1.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_da_challenge": {
|
| 4 |
+
"acc": 0.24242424242424243,
|
| 5 |
+
"acc_stderr": 0.02490893747050877,
|
| 6 |
+
"acc_norm": 0.24915824915824916,
|
| 7 |
+
"acc_norm_stderr": 0.025140041284626418
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_da_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_da_challenge_gpt2-large.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_da_challenge": {
|
| 4 |
+
"acc": 0.23232323232323232,
|
| 5 |
+
"acc_stderr": 0.02454650495612789,
|
| 6 |
+
"acc_norm": 0.24242424242424243,
|
| 7 |
+
"acc_norm_stderr": 0.024908937470508753
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_da_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-large",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_da_challenge_gpt2-medium.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_da_challenge": {
|
| 4 |
+
"acc": 0.24579124579124578,
|
| 5 |
+
"acc_stderr": 0.0250255213842353,
|
| 6 |
+
"acc_norm": 0.2727272727272727,
|
| 7 |
+
"acc_norm_stderr": 0.025886127156886297
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_da_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-medium",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_da_challenge_gpt2.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_da_challenge": {
|
| 4 |
+
"acc": 0.2222222222222222,
|
| 5 |
+
"acc_stderr": 0.02416437978893547,
|
| 6 |
+
"acc_norm": 0.23905723905723905,
|
| 7 |
+
"acc_norm_stderr": 0.024790260423468984
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_da_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_da_challenge_llama-7B.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_da_challenge": {
|
| 4 |
+
"acc": 0.3063973063973064,
|
| 5 |
+
"acc_stderr": 0.026794891419479452,
|
| 6 |
+
"acc_norm": 0.3367003367003367,
|
| 7 |
+
"acc_norm_stderr": 0.02746823841289221
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_da_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_de_challenge_bloom-1b7.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_de_challenge": {
|
| 4 |
+
"acc": 0.24496644295302014,
|
| 5 |
+
"acc_stderr": 0.024955035980898946,
|
| 6 |
+
"acc_norm": 0.2953020134228188,
|
| 7 |
+
"acc_norm_stderr": 0.026470155629081085
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_de_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-1b7",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_de_challenge_bloom-560.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_de_challenge": {
|
| 4 |
+
"acc": 0.2348993288590604,
|
| 5 |
+
"acc_stderr": 0.024599255015999244,
|
| 6 |
+
"acc_norm": 0.28187919463087246,
|
| 7 |
+
"acc_norm_stderr": 0.026106703750007426
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_de_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-560m",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_de_challenge_bloom-7b1.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_de_challenge": {
|
| 4 |
+
"acc": 0.2684563758389262,
|
| 5 |
+
"acc_stderr": 0.0257145395148175,
|
| 6 |
+
"acc_norm": 0.2684563758389262,
|
| 7 |
+
"acc_norm_stderr": 0.0257145395148175
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_de_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_de_challenge_gpt2-large.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_de_challenge": {
|
| 4 |
+
"acc": 0.23825503355704697,
|
| 5 |
+
"acc_stderr": 0.024719951493159625,
|
| 6 |
+
"acc_norm": 0.27181208053691275,
|
| 7 |
+
"acc_norm_stderr": 0.025815342279487567
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_de_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-large",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_de_challenge_gpt2-medium.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_de_challenge": {
|
| 4 |
+
"acc": 0.23825503355704697,
|
| 5 |
+
"acc_stderr": 0.024719951493159625,
|
| 6 |
+
"acc_norm": 0.28859060402684567,
|
| 7 |
+
"acc_norm_stderr": 0.026291942108676806
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_de_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-medium",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_de_challenge_gpt2.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_de_challenge": {
|
| 4 |
+
"acc": 0.22483221476510068,
|
| 5 |
+
"acc_stderr": 0.02422416982965075,
|
| 6 |
+
"acc_norm": 0.21140939597315436,
|
| 7 |
+
"acc_norm_stderr": 0.02369243605357901
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_de_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_de_challenge_llama-7B.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_de_challenge": {
|
| 4 |
+
"acc": 0.2785234899328859,
|
| 5 |
+
"acc_stderr": 0.0260114035784859,
|
| 6 |
+
"acc_norm": 0.348993288590604,
|
| 7 |
+
"acc_norm_stderr": 0.027658144793750224
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_de_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_es_challenge_bloom-1b7.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_es_challenge": {
|
| 4 |
+
"acc": 0.2356902356902357,
|
| 5 |
+
"acc_stderr": 0.02466946003490763,
|
| 6 |
+
"acc_norm": 0.2895622895622896,
|
| 7 |
+
"acc_norm_stderr": 0.026362594432681956
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_es_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-1b7",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_es_challenge_bloom-560.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_es_challenge": {
|
| 4 |
+
"acc": 0.2255892255892256,
|
| 5 |
+
"acc_stderr": 0.024293999292957367,
|
| 6 |
+
"acc_norm": 0.2356902356902357,
|
| 7 |
+
"acc_norm_stderr": 0.02466946003490764
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_es_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-560m",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_es_challenge_bloom-7b1.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_es_challenge": {
|
| 4 |
+
"acc": 0.3265993265993266,
|
| 5 |
+
"acc_stderr": 0.027258287015652305,
|
| 6 |
+
"acc_norm": 0.3602693602693603,
|
| 7 |
+
"acc_norm_stderr": 0.02790399493827167
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_es_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_es_challenge_gpt2-large.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_es_challenge": {
|
| 4 |
+
"acc": 0.2222222222222222,
|
| 5 |
+
"acc_stderr": 0.024164379788935483,
|
| 6 |
+
"acc_norm": 0.26262626262626265,
|
| 7 |
+
"acc_norm_stderr": 0.02557802773320012
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_es_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-large",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_es_challenge_gpt2-medium.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_es_challenge": {
|
| 4 |
+
"acc": 0.1919191919191919,
|
| 5 |
+
"acc_stderr": 0.022889733897083934,
|
| 6 |
+
"acc_norm": 0.25252525252525254,
|
| 7 |
+
"acc_norm_stderr": 0.02525252525252536
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_es_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-medium",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_es_challenge_gpt2.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_es_challenge": {
|
| 4 |
+
"acc": 0.19865319865319866,
|
| 5 |
+
"acc_stderr": 0.023190610381322127,
|
| 6 |
+
"acc_norm": 0.24579124579124578,
|
| 7 |
+
"acc_norm_stderr": 0.0250255213842353
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_es_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_es_challenge_llama-7B.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_es_challenge": {
|
| 4 |
+
"acc": 0.3501683501683502,
|
| 5 |
+
"acc_stderr": 0.027726370308831506,
|
| 6 |
+
"acc_norm": 0.3602693602693603,
|
| 7 |
+
"acc_norm_stderr": 0.02790399493827167
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_es_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_eu_challenge_bloom-1b7.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_eu_challenge": {
|
| 4 |
+
"acc": 0.22377622377622378,
|
| 5 |
+
"acc_stderr": 0.02468755105337312,
|
| 6 |
+
"acc_norm": 0.2517482517482518,
|
| 7 |
+
"acc_norm_stderr": 0.02570896966075011
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_eu_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-1b7",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_eu_challenge_bloom-560.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_eu_challenge": {
|
| 4 |
+
"acc": 0.24475524475524477,
|
| 5 |
+
"acc_stderr": 0.02546756553847068,
|
| 6 |
+
"acc_norm": 0.19230769230769232,
|
| 7 |
+
"acc_norm_stderr": 0.023345268410264786
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_eu_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=bigscience/bloom-560m",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_eu_challenge_bloom-7b1.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_eu_challenge": {
|
| 4 |
+
"acc": 0.23076923076923078,
|
| 5 |
+
"acc_stderr": 0.024957141712425013,
|
| 6 |
+
"acc_norm": 0.24125874125874125,
|
| 7 |
+
"acc_norm_stderr": 0.025343462496583764
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_eu_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_eu_challenge_gpt2-large.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_eu_challenge": {
|
| 4 |
+
"acc": 0.25874125874125875,
|
| 5 |
+
"acc_stderr": 0.02594151450124707,
|
| 6 |
+
"acc_norm": 0.24125874125874125,
|
| 7 |
+
"acc_norm_stderr": 0.025343462496583737
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_eu_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-large",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_eu_challenge_gpt2-medium.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_eu_challenge": {
|
| 4 |
+
"acc": 0.2762237762237762,
|
| 5 |
+
"acc_stderr": 0.026485626798716442,
|
| 6 |
+
"acc_norm": 0.25874125874125875,
|
| 7 |
+
"acc_norm_stderr": 0.025941514501247064
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_eu_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2-medium",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_eu_challenge_gpt2.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_eu_challenge": {
|
| 4 |
+
"acc": 0.2762237762237762,
|
| 5 |
+
"acc_stderr": 0.026485626798716456,
|
| 6 |
+
"acc_norm": 0.24825174825174826,
|
| 7 |
+
"acc_norm_stderr": 0.025589390464738234
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_eu_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=gpt2",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|
evals/arc-challenge/arc_eu_challenge_llama-7B.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": {
|
| 3 |
+
"arc_eu_challenge": {
|
| 4 |
+
"acc": 0.26223776223776224,
|
| 5 |
+
"acc_stderr": 0.026054539173797044,
|
| 6 |
+
"acc_norm": 0.23426573426573427,
|
| 7 |
+
"acc_norm_stderr": 0.02508828621716978
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"versions": {
|
| 11 |
+
"arc_eu_challenge": 0
|
| 12 |
+
},
|
| 13 |
+
"config": {
|
| 14 |
+
"model": "hf-auto",
|
| 15 |
+
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
| 16 |
+
"batch_size": "1",
|
| 17 |
+
"device": "cuda",
|
| 18 |
+
"no_cache": false,
|
| 19 |
+
"limit": null,
|
| 20 |
+
"bootstrap_iters": 100000,
|
| 21 |
+
"description_dict": {}
|
| 22 |
+
}
|
| 23 |
+
}
|