Bram Vanroy
committed on
Commit
·
0658988
1
Parent(s):
351f9fe
remove mixtral - was not tested in 8-bit
Browse files
- app.py +23 -6
- evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json +0 -23
- evals/hellaswag/hellaswag_nl_Mixtral-8x7B-v0.1.json +0 -23
- evals/models.json +0 -8
- generate_overview_json.py +2 -1
app.py
CHANGED
|
@@ -28,6 +28,8 @@ MODEL_TYPE_EMOJIS = {
|
|
| 28 |
"RL-tuned": "🟦",
|
| 29 |
}
|
| 30 |
|
|
|
|
|
|
|
| 31 |
|
| 32 |
@dataclass
|
| 33 |
class Result:
|
|
@@ -44,12 +46,14 @@ class Result:
|
|
| 44 |
num_parameters_kmb: str = field(init=False)
|
| 45 |
|
| 46 |
def __post_init__(self):
|
| 47 |
-
if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
raise ValueError(
|
| 49 |
-
f"
|
| 50 |
)
|
| 51 |
-
if self.dutch_coverage not in ["none", "pretrained", "fine-tuned"]:
|
| 52 |
-
raise ValueError(f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned'")
|
| 53 |
|
| 54 |
field_names = {f.name for f in fields(self)}
|
| 55 |
for task_name in TASK_METRICS:
|
|
@@ -128,8 +132,10 @@ class ResultSet:
|
|
| 128 |
f" dotted;'>{result.short_name}</a>"
|
| 129 |
)
|
| 130 |
if attr == "short_name"
|
| 131 |
-
else MODEL_TYPE_EMOJIS
|
| 132 |
if attr == "model_type"
|
|
|
|
|
|
|
| 133 |
else getattr(result, attr)
|
| 134 |
for attr, col_name in self.column_names.items()
|
| 135 |
}
|
|
@@ -203,8 +209,16 @@ def collect_results() -> ResultSet:
|
|
| 203 |
|
| 204 |
if "results" not in data:
|
| 205 |
continue
|
|
|
|
| 206 |
task_results = data["results"]
|
| 207 |
short_name = pfin.stem.split("_", 2)[2].lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
if short_name not in model_results:
|
| 209 |
model_results[short_name] = {
|
| 210 |
"short_name": short_name,
|
|
@@ -228,7 +242,10 @@ with gr.Blocks() as demo:
|
|
| 228 |
gr.HTML(TITLE)
|
| 229 |
gr.Markdown(INTRO_TEXT)
|
| 230 |
|
| 231 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
results = collect_results()
|
| 234 |
|
|
|
|
| 28 |
"RL-tuned": "🟦",
|
| 29 |
}
|
| 30 |
|
| 31 |
+
NOT_GIVEN_SYMBOL = "❔"
|
| 32 |
+
|
| 33 |
|
| 34 |
@dataclass
|
| 35 |
class Result:
|
|
|
|
| 46 |
num_parameters_kmb: str = field(init=False)
|
| 47 |
|
| 48 |
def __post_init__(self):
|
| 49 |
+
if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]:
|
| 50 |
+
raise ValueError(
|
| 51 |
+
f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned', 'instruction-tuned', 'RL-tuned', 'not-given"
|
| 52 |
+
)
|
| 53 |
+
if self.dutch_coverage not in ["none", "pretrained", "fine-tuned", "not-given"]:
|
| 54 |
raise ValueError(
|
| 55 |
+
f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned', 'not-given"
|
| 56 |
)
|
|
|
|
|
|
|
| 57 |
|
| 58 |
field_names = {f.name for f in fields(self)}
|
| 59 |
for task_name in TASK_METRICS:
|
|
|
|
| 132 |
f" dotted;'>{result.short_name}</a>"
|
| 133 |
)
|
| 134 |
if attr == "short_name"
|
| 135 |
+
else MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL)
|
| 136 |
if attr == "model_type"
|
| 137 |
+
else (result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL)
|
| 138 |
+
if attr == "dutch_coverage"
|
| 139 |
else getattr(result, attr)
|
| 140 |
for attr, col_name in self.column_names.items()
|
| 141 |
}
|
|
|
|
| 209 |
|
| 210 |
if "results" not in data:
|
| 211 |
continue
|
| 212 |
+
|
| 213 |
task_results = data["results"]
|
| 214 |
short_name = pfin.stem.split("_", 2)[2].lower()
|
| 215 |
+
|
| 216 |
+
if short_name not in model_info:
|
| 217 |
+
raise KeyError(
|
| 218 |
+
f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON"
|
| 219 |
+
f" file exists that has not yet been processed. First run the `generate_overview_json.py` script."
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
if short_name not in model_results:
|
| 223 |
model_results[short_name] = {
|
| 224 |
"short_name": short_name,
|
|
|
|
| 242 |
gr.HTML(TITLE)
|
| 243 |
gr.Markdown(INTRO_TEXT)
|
| 244 |
|
| 245 |
+
gr.Markdown(
|
| 246 |
+
f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!"
|
| 247 |
+
" All models have been benchmarked in 8-bit."
|
| 248 |
+
)
|
| 249 |
|
| 250 |
results = collect_results()
|
| 251 |
|
evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"hellaswag_nl": {
|
| 4 |
-
"acc": 0.44079870480302213,
|
| 5 |
-
"acc_stderr": 0.005158280633507224,
|
| 6 |
-
"acc_norm": 0.5840259039395574,
|
| 7 |
-
"acc_norm_stderr": 0.005120942804814836
|
| 8 |
-
}
|
| 9 |
-
},
|
| 10 |
-
"versions": {
|
| 11 |
-
"hellaswag_nl": 1
|
| 12 |
-
},
|
| 13 |
-
"config": {
|
| 14 |
-
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
| 16 |
-
"batch_size": "auto",
|
| 17 |
-
"device": "cuda",
|
| 18 |
-
"no_cache": false,
|
| 19 |
-
"limit": null,
|
| 20 |
-
"bootstrap_iters": 100000,
|
| 21 |
-
"description_dict": {}
|
| 22 |
-
}
|
| 23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/hellaswag/hellaswag_nl_Mixtral-8x7B-v0.1.json
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"results": {
|
| 3 |
-
"hellaswag_nl": {
|
| 4 |
-
"acc": 0.5143011332973556,
|
| 5 |
-
"acc_stderr": 0.0051926973681393875,
|
| 6 |
-
"acc_norm": 0.67835941716136,
|
| 7 |
-
"acc_norm_stderr": 0.004853064643337017
|
| 8 |
-
}
|
| 9 |
-
},
|
| 10 |
-
"versions": {
|
| 11 |
-
"hellaswag_nl": 1
|
| 12 |
-
},
|
| 13 |
-
"config": {
|
| 14 |
-
"model": "hf-auto",
|
| 15 |
-
"model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=auto",
|
| 16 |
-
"batch_size": 1,
|
| 17 |
-
"device": "cuda",
|
| 18 |
-
"no_cache": false,
|
| 19 |
-
"limit": null,
|
| 20 |
-
"bootstrap_iters": 100000,
|
| 21 |
-
"description_dict": {}
|
| 22 |
-
}
|
| 23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/models.json
CHANGED
|
@@ -87,14 +87,6 @@
|
|
| 87 |
"num_parameters": 7241732096,
|
| 88 |
"quantization": "8-bit"
|
| 89 |
},
|
| 90 |
-
"mixtral-8x7b-v0.1": {
|
| 91 |
-
"compute_dtype": "auto",
|
| 92 |
-
"dutch_coverage": "not-given",
|
| 93 |
-
"model_name": "mistralai/Mixtral-8x7B-v0.1",
|
| 94 |
-
"model_type": "not-given",
|
| 95 |
-
"num_parameters": 46702792704,
|
| 96 |
-
"quantization": null
|
| 97 |
-
},
|
| 98 |
"neural-chat-7b-v3-1": {
|
| 99 |
"compute_dtype": "bfloat16",
|
| 100 |
"dutch_coverage": "none",
|
|
|
|
| 87 |
"num_parameters": 7241732096,
|
| 88 |
"quantization": "8-bit"
|
| 89 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
"neural-chat-7b-v3-1": {
|
| 91 |
"compute_dtype": "bfloat16",
|
| 92 |
"dutch_coverage": "none",
|
generate_overview_json.py
CHANGED
|
@@ -40,7 +40,8 @@ def main():
|
|
| 40 |
"model_type": results[short_name]["model_type"]
|
| 41 |
if short_name in results and "model_type" in results[short_name]
|
| 42 |
else "not-given",
|
| 43 |
-
"dutch_coverage": results[short_name]["dutch_coverage"]
|
|
|
|
| 44 |
else "not-given",
|
| 45 |
}
|
| 46 |
|
|
|
|
| 40 |
"model_type": results[short_name]["model_type"]
|
| 41 |
if short_name in results and "model_type" in results[short_name]
|
| 42 |
else "not-given",
|
| 43 |
+
"dutch_coverage": results[short_name]["dutch_coverage"]
|
| 44 |
+
if short_name in results and "dutch_coverage" in results[short_name]
|
| 45 |
else "not-given",
|
| 46 |
}
|
| 47 |
|