Spaces:
Running
Running
new results with random
Browse files
- all_results.json +0 -0
- app.py +39 -42
all_results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
-
|
| 2 |
import json
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
|
|
|
|
|
|
|
| 7 |
|
| 8 |
print("Loading datasets...")
|
| 9 |
|
|
@@ -67,13 +69,13 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 67 |
|
| 68 |
try:
|
| 69 |
overall_acc = [results['overall_acc'] for results in results_list]
|
| 70 |
-
overall_acc =
|
| 71 |
|
| 72 |
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
|
| 73 |
-
consistency_score_3 =
|
| 74 |
|
| 75 |
AC3_3 = [results['AC3_3'] for results in results_list]
|
| 76 |
-
AC3_3 =
|
| 77 |
|
| 78 |
except:
|
| 79 |
print(results_list)
|
|
@@ -126,21 +128,21 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 126 |
|
| 127 |
|
| 128 |
try:
|
| 129 |
-
English
|
| 130 |
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
|
| 131 |
-
Chinese
|
| 132 |
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
|
| 133 |
-
Filipino
|
| 134 |
-
Spanish
|
| 135 |
-
Malay
|
| 136 |
|
| 137 |
-
English
|
| 138 |
-
Vietnamese =
|
| 139 |
-
Chinese
|
| 140 |
-
Indonesian =
|
| 141 |
-
Filipino
|
| 142 |
-
Spanish
|
| 143 |
-
Malay
|
| 144 |
|
| 145 |
|
| 146 |
except:
|
|
@@ -208,13 +210,13 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
|
|
| 208 |
|
| 209 |
try:
|
| 210 |
overall_acc = [results['overall_acc'] for results in results_list]
|
| 211 |
-
overall_acc =
|
| 212 |
|
| 213 |
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
|
| 214 |
-
consistency_score_3 =
|
| 215 |
|
| 216 |
AC3_3 = [results['AC3_3'] for results in results_list]
|
| 217 |
-
AC3_3 =
|
| 218 |
|
| 219 |
except:
|
| 220 |
print(results_list)
|
|
@@ -267,21 +269,21 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
|
|
| 267 |
|
| 268 |
|
| 269 |
try:
|
| 270 |
-
English
|
| 271 |
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
|
| 272 |
-
Chinese
|
| 273 |
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
|
| 274 |
-
Filipino
|
| 275 |
-
Spanish
|
| 276 |
-
Malay
|
| 277 |
|
| 278 |
-
English
|
| 279 |
-
Vietnamese =
|
| 280 |
-
Chinese
|
| 281 |
-
Indonesian =
|
| 282 |
-
Filipino
|
| 283 |
-
Spanish
|
| 284 |
-
Malay
|
| 285 |
|
| 286 |
|
| 287 |
except:
|
|
@@ -346,14 +348,12 @@ def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 346 |
|
| 347 |
|
| 348 |
try:
|
| 349 |
-
accuracy = [results['accuracy'] for results in results_list]
|
| 350 |
-
accuracy = sum(accuracy) / len(accuracy)
|
| 351 |
|
| 352 |
except:
|
| 353 |
print(results_list)
|
| 354 |
accuracy = -1
|
| 355 |
|
| 356 |
-
|
| 357 |
res = {
|
| 358 |
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
|
| 359 |
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
|
|
@@ -401,8 +401,7 @@ def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 401 |
|
| 402 |
|
| 403 |
try:
|
| 404 |
-
accuracy = [results['accuracy'] for results in results_list]
|
| 405 |
-
accuracy = sum(accuracy) / len(accuracy)
|
| 406 |
|
| 407 |
except:
|
| 408 |
print(results_list)
|
|
@@ -456,8 +455,7 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 456 |
|
| 457 |
|
| 458 |
try:
|
| 459 |
-
accuracy = [results['accuracy'] for results in results_list]
|
| 460 |
-
accuracy = sum(accuracy) / len(accuracy)
|
| 461 |
|
| 462 |
except:
|
| 463 |
print(results_list)
|
|
@@ -511,8 +509,7 @@ def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 511 |
|
| 512 |
|
| 513 |
try:
|
| 514 |
-
accuracy = [results['accuracy'] for results in results_list]
|
| 515 |
-
accuracy = sum(accuracy) / len(accuracy)
|
| 516 |
|
| 517 |
except:
|
| 518 |
print(results_list)
|
|
@@ -789,8 +786,8 @@ with block:
|
|
| 789 |
with gr.TabItem("Overall"):
|
| 790 |
with gr.Row():
|
| 791 |
gr.components.Dataframe(
|
| 792 |
-
|
| 793 |
-
datatype=["number", "markdown"] + ["number"] * len(
|
| 794 |
type="pandas",
|
| 795 |
)
|
| 796 |
|
|
|
|
| 1 |
+
|
| 2 |
import json
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
+
from statistics import median
|
| 8 |
+
|
| 9 |
|
| 10 |
print("Loading datasets...")
|
| 11 |
|
|
|
|
| 69 |
|
| 70 |
try:
|
| 71 |
overall_acc = [results['overall_acc'] for results in results_list]
|
| 72 |
+
overall_acc = median(overall_acc)
|
| 73 |
|
| 74 |
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
|
| 75 |
+
consistency_score_3 = median(consistency_score_3)
|
| 76 |
|
| 77 |
AC3_3 = [results['AC3_3'] for results in results_list]
|
| 78 |
+
AC3_3 = median(AC3_3)
|
| 79 |
|
| 80 |
except:
|
| 81 |
print(results_list)
|
|
|
|
| 128 |
|
| 129 |
|
| 130 |
try:
|
| 131 |
+
English = [results['language_acc']['English'] for results in results_list]
|
| 132 |
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
|
| 133 |
+
Chinese = [results['language_acc']['Chinese'] for results in results_list]
|
| 134 |
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
|
| 135 |
+
Filipino = [results['language_acc']['Filipino'] for results in results_list]
|
| 136 |
+
Spanish = [results['language_acc']['Spanish'] for results in results_list]
|
| 137 |
+
Malay = [results['language_acc']['Malay'] for results in results_list]
|
| 138 |
|
| 139 |
+
English = median(English)
|
| 140 |
+
Vietnamese = median(Vietnamese)
|
| 141 |
+
Chinese = median(Chinese)
|
| 142 |
+
Indonesian = median(Indonesian)
|
| 143 |
+
Filipino = median(Filipino)
|
| 144 |
+
Spanish = median(Spanish)
|
| 145 |
+
Malay = median(Malay)
|
| 146 |
|
| 147 |
|
| 148 |
except:
|
|
|
|
| 210 |
|
| 211 |
try:
|
| 212 |
overall_acc = [results['overall_acc'] for results in results_list]
|
| 213 |
+
overall_acc = median(overall_acc)
|
| 214 |
|
| 215 |
consistency_score_3 = [results['consistency_score_3'] for results in results_list]
|
| 216 |
+
consistency_score_3 = median(consistency_score_3)
|
| 217 |
|
| 218 |
AC3_3 = [results['AC3_3'] for results in results_list]
|
| 219 |
+
AC3_3 = median(AC3_3)
|
| 220 |
|
| 221 |
except:
|
| 222 |
print(results_list)
|
|
|
|
| 269 |
|
| 270 |
|
| 271 |
try:
|
| 272 |
+
English = [results['language_acc']['English'] for results in results_list]
|
| 273 |
Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
|
| 274 |
+
Chinese = [results['language_acc']['Chinese'] for results in results_list]
|
| 275 |
Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
|
| 276 |
+
Filipino = [results['language_acc']['Filipino'] for results in results_list]
|
| 277 |
+
Spanish = [results['language_acc']['Spanish'] for results in results_list]
|
| 278 |
+
Malay = [results['language_acc']['Malay'] for results in results_list]
|
| 279 |
|
| 280 |
+
English = median(English)
|
| 281 |
+
Vietnamese = median(Vietnamese)
|
| 282 |
+
Chinese = median(Chinese)
|
| 283 |
+
Indonesian = median(Indonesian)
|
| 284 |
+
Filipino = median(Filipino)
|
| 285 |
+
Spanish = median(Spanish)
|
| 286 |
+
Malay = median(Malay)
|
| 287 |
|
| 288 |
|
| 289 |
except:
|
|
|
|
| 348 |
|
| 349 |
|
| 350 |
try:
|
| 351 |
+
accuracy = median([results['accuracy'] for results in results_list])
|
|
|
|
| 352 |
|
| 353 |
except:
|
| 354 |
print(results_list)
|
| 355 |
accuracy = -1
|
| 356 |
|
|
|
|
| 357 |
res = {
|
| 358 |
"Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
|
| 359 |
"Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
|
|
|
|
| 401 |
|
| 402 |
|
| 403 |
try:
|
| 404 |
+
accuracy = median([results['accuracy'] for results in results_list])
|
|
|
|
| 405 |
|
| 406 |
except:
|
| 407 |
print(results_list)
|
|
|
|
| 455 |
|
| 456 |
|
| 457 |
try:
|
| 458 |
+
accuracy = median([results['accuracy'] for results in results_list])
|
|
|
|
| 459 |
|
| 460 |
except:
|
| 461 |
print(results_list)
|
|
|
|
| 509 |
|
| 510 |
|
| 511 |
try:
|
| 512 |
+
accuracy = median([results['accuracy'] for results in results_list])
|
|
|
|
| 513 |
|
| 514 |
except:
|
| 515 |
print(results_list)
|
|
|
|
| 786 |
with gr.TabItem("Overall"):
|
| 787 |
with gr.Row():
|
| 788 |
gr.components.Dataframe(
|
| 789 |
+
PH_EVAL_FIVE_SHOT,
|
| 790 |
+
datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
|
| 791 |
type="pandas",
|
| 792 |
)
|
| 793 |
|