Spaces:
Running
Running
new
Browse files
app.py
CHANGED
|
@@ -9,10 +9,8 @@ from statistics import median
|
|
| 9 |
|
| 10 |
print("Loading datasets...")
|
| 11 |
|
| 12 |
-
|
| 13 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
| 14 |
|
| 15 |
-
|
| 16 |
def add_rank(df, compute_average=True):
|
| 17 |
cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]]
|
| 18 |
if len(cols_to_rank) == 1:
|
|
@@ -78,7 +76,6 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 78 |
AC3_3 = median(AC3_3)
|
| 79 |
|
| 80 |
except:
|
| 81 |
-
print(results_list)
|
| 82 |
consistency_score_3 = -1
|
| 83 |
overall_acc = -1
|
| 84 |
AC3_3 = -1
|
|
@@ -146,7 +143,6 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 146 |
|
| 147 |
|
| 148 |
except:
|
| 149 |
-
print(results_list)
|
| 150 |
English = -1
|
| 151 |
Vietnamese = -1
|
| 152 |
Chinese = -1
|
|
@@ -219,7 +215,6 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
|
|
| 219 |
AC3_3 = median(AC3_3)
|
| 220 |
|
| 221 |
except:
|
| 222 |
-
print(results_list)
|
| 223 |
consistency_score_3 = -1
|
| 224 |
overall_acc = -1
|
| 225 |
AC3_3 = -1
|
|
@@ -287,7 +282,6 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
|
|
| 287 |
|
| 288 |
|
| 289 |
except:
|
| 290 |
-
print(results_list)
|
| 291 |
English = -1
|
| 292 |
Vietnamese = -1
|
| 293 |
Chinese = -1
|
|
@@ -351,7 +345,6 @@ def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 351 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 352 |
|
| 353 |
except:
|
| 354 |
-
print(results_list)
|
| 355 |
accuracy = -1
|
| 356 |
|
| 357 |
res = {
|
|
@@ -404,7 +397,6 @@ def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 404 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 405 |
|
| 406 |
except:
|
| 407 |
-
print(results_list)
|
| 408 |
accuracy = -1
|
| 409 |
|
| 410 |
|
|
@@ -458,7 +450,6 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 458 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 459 |
|
| 460 |
except:
|
| 461 |
-
print(results_list)
|
| 462 |
accuracy = -1
|
| 463 |
|
| 464 |
|
|
@@ -512,7 +503,6 @@ def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 512 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 513 |
|
| 514 |
except:
|
| 515 |
-
print(results_list)
|
| 516 |
accuracy = -1
|
| 517 |
|
| 518 |
|
|
@@ -566,7 +556,6 @@ def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 566 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 567 |
|
| 568 |
except:
|
| 569 |
-
print(results_list)
|
| 570 |
bleu_score = -1
|
| 571 |
|
| 572 |
|
|
@@ -619,7 +608,6 @@ def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 619 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 620 |
|
| 621 |
except:
|
| 622 |
-
print(results_list)
|
| 623 |
bleu_score = -1
|
| 624 |
|
| 625 |
|
|
@@ -674,7 +662,6 @@ def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 674 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 675 |
|
| 676 |
except:
|
| 677 |
-
print(results_list)
|
| 678 |
bleu_score = -1
|
| 679 |
|
| 680 |
|
|
@@ -727,7 +714,6 @@ def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 727 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 728 |
|
| 729 |
except:
|
| 730 |
-
print(results_list)
|
| 731 |
bleu_score = -1
|
| 732 |
|
| 733 |
|
|
@@ -781,7 +767,6 @@ def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 781 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 782 |
|
| 783 |
except:
|
| 784 |
-
print(results_list)
|
| 785 |
bleu_score = -1
|
| 786 |
|
| 787 |
|
|
@@ -835,7 +820,6 @@ def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 835 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 836 |
|
| 837 |
except:
|
| 838 |
-
print(results_list)
|
| 839 |
accuracy = -1
|
| 840 |
|
| 841 |
|
|
@@ -890,7 +874,6 @@ def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 890 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 891 |
|
| 892 |
except:
|
| 893 |
-
print(results_list)
|
| 894 |
accuracy = -1
|
| 895 |
|
| 896 |
|
|
@@ -944,7 +927,6 @@ def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 944 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 945 |
|
| 946 |
except:
|
| 947 |
-
print(results_list)
|
| 948 |
accuracy = -1
|
| 949 |
|
| 950 |
|
|
@@ -998,7 +980,6 @@ def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 998 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 999 |
|
| 1000 |
except:
|
| 1001 |
-
print(results_list)
|
| 1002 |
accuracy = -1
|
| 1003 |
|
| 1004 |
|
|
@@ -1054,7 +1035,6 @@ def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1054 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1055 |
|
| 1056 |
except:
|
| 1057 |
-
print(results_list)
|
| 1058 |
accuracy = -1
|
| 1059 |
|
| 1060 |
|
|
@@ -1112,7 +1092,6 @@ def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1112 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1113 |
|
| 1114 |
except:
|
| 1115 |
-
print(results_list)
|
| 1116 |
accuracy = -1
|
| 1117 |
|
| 1118 |
|
|
@@ -1167,7 +1146,6 @@ def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1167 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1168 |
|
| 1169 |
except:
|
| 1170 |
-
print(results_list)
|
| 1171 |
accuracy = -1
|
| 1172 |
|
| 1173 |
|
|
@@ -1222,7 +1200,6 @@ def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1222 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1223 |
|
| 1224 |
except:
|
| 1225 |
-
print(results_list)
|
| 1226 |
accuracy = -1
|
| 1227 |
|
| 1228 |
|
|
@@ -1278,7 +1255,6 @@ def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1278 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1279 |
|
| 1280 |
except:
|
| 1281 |
-
print(results_list)
|
| 1282 |
accuracy = -1
|
| 1283 |
|
| 1284 |
|
|
@@ -1333,7 +1309,6 @@ def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1333 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1334 |
|
| 1335 |
except:
|
| 1336 |
-
print(results_list)
|
| 1337 |
accuracy = -1
|
| 1338 |
|
| 1339 |
|
|
@@ -1388,7 +1363,6 @@ def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1388 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1389 |
|
| 1390 |
except:
|
| 1391 |
-
print(results_list)
|
| 1392 |
accuracy = -1
|
| 1393 |
|
| 1394 |
|
|
@@ -1445,7 +1419,6 @@ def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1445 |
rougeL = median([results['rougeL'] for results in results_list])
|
| 1446 |
|
| 1447 |
except:
|
| 1448 |
-
print(results_list)
|
| 1449 |
rouge1 = -1
|
| 1450 |
rouge2 = -1
|
| 1451 |
rougeL = -1
|
|
@@ -1505,7 +1478,6 @@ def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1505 |
rougeL = median([results['rougeL'] for results in results_list])
|
| 1506 |
|
| 1507 |
except:
|
| 1508 |
-
print(results_list)
|
| 1509 |
rouge1 = -1
|
| 1510 |
rouge2 = -1
|
| 1511 |
rougeL = -1
|
|
@@ -1565,7 +1537,6 @@ def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1565 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1566 |
|
| 1567 |
except:
|
| 1568 |
-
print(results_list)
|
| 1569 |
accuracy = -1
|
| 1570 |
|
| 1571 |
|
|
@@ -1621,7 +1592,6 @@ def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1621 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1622 |
|
| 1623 |
except:
|
| 1624 |
-
print(results_list)
|
| 1625 |
accuracy = -1
|
| 1626 |
|
| 1627 |
|
|
@@ -1678,7 +1648,6 @@ def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1678 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1679 |
|
| 1680 |
except:
|
| 1681 |
-
print(results_list)
|
| 1682 |
accuracy = -1
|
| 1683 |
|
| 1684 |
|
|
@@ -1735,7 +1704,6 @@ def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1735 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1736 |
|
| 1737 |
except:
|
| 1738 |
-
print(results_list)
|
| 1739 |
accuracy = -1
|
| 1740 |
|
| 1741 |
|
|
@@ -1792,7 +1760,6 @@ def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1792 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1793 |
|
| 1794 |
except:
|
| 1795 |
-
print(results_list)
|
| 1796 |
accuracy = -1
|
| 1797 |
|
| 1798 |
|
|
@@ -1849,7 +1816,6 @@ def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1849 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1850 |
|
| 1851 |
except:
|
| 1852 |
-
print(results_list)
|
| 1853 |
accuracy = -1
|
| 1854 |
|
| 1855 |
|
|
@@ -1906,7 +1872,6 @@ def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1906 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1907 |
|
| 1908 |
except:
|
| 1909 |
-
print(results_list)
|
| 1910 |
accuracy = -1
|
| 1911 |
|
| 1912 |
|
|
@@ -1964,7 +1929,6 @@ def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):
|
|
| 1964 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1965 |
|
| 1966 |
except:
|
| 1967 |
-
print(results_list)
|
| 1968 |
accuracy = -1
|
| 1969 |
|
| 1970 |
|
|
@@ -3052,15 +3016,6 @@ with block:
|
|
| 3052 |
|
| 3053 |
|
| 3054 |
|
| 3055 |
-
|
| 3056 |
-
|
| 3057 |
-
|
| 3058 |
-
|
| 3059 |
-
|
| 3060 |
-
|
| 3061 |
-
|
| 3062 |
-
|
| 3063 |
-
|
| 3064 |
gr.Markdown(r"""
|
| 3065 |
|
| 3066 |
If this work is useful to you, please citing our work:
|
|
|
|
| 9 |
|
| 10 |
print("Loading datasets...")
|
| 11 |
|
|
|
|
| 12 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
| 13 |
|
|
|
|
| 14 |
def add_rank(df, compute_average=True):
|
| 15 |
cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]]
|
| 16 |
if len(cols_to_rank) == 1:
|
|
|
|
| 76 |
AC3_3 = median(AC3_3)
|
| 77 |
|
| 78 |
except:
|
|
|
|
| 79 |
consistency_score_3 = -1
|
| 80 |
overall_acc = -1
|
| 81 |
AC3_3 = -1
|
|
|
|
| 143 |
|
| 144 |
|
| 145 |
except:
|
|
|
|
| 146 |
English = -1
|
| 147 |
Vietnamese = -1
|
| 148 |
Chinese = -1
|
|
|
|
| 215 |
AC3_3 = median(AC3_3)
|
| 216 |
|
| 217 |
except:
|
|
|
|
| 218 |
consistency_score_3 = -1
|
| 219 |
overall_acc = -1
|
| 220 |
AC3_3 = -1
|
|
|
|
| 282 |
|
| 283 |
|
| 284 |
except:
|
|
|
|
| 285 |
English = -1
|
| 286 |
Vietnamese = -1
|
| 287 |
Chinese = -1
|
|
|
|
| 345 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 346 |
|
| 347 |
except:
|
|
|
|
| 348 |
accuracy = -1
|
| 349 |
|
| 350 |
res = {
|
|
|
|
| 397 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 398 |
|
| 399 |
except:
|
|
|
|
| 400 |
accuracy = -1
|
| 401 |
|
| 402 |
|
|
|
|
| 450 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 451 |
|
| 452 |
except:
|
|
|
|
| 453 |
accuracy = -1
|
| 454 |
|
| 455 |
|
|
|
|
| 503 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 504 |
|
| 505 |
except:
|
|
|
|
| 506 |
accuracy = -1
|
| 507 |
|
| 508 |
|
|
|
|
| 556 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 557 |
|
| 558 |
except:
|
|
|
|
| 559 |
bleu_score = -1
|
| 560 |
|
| 561 |
|
|
|
|
| 608 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 609 |
|
| 610 |
except:
|
|
|
|
| 611 |
bleu_score = -1
|
| 612 |
|
| 613 |
|
|
|
|
| 662 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 663 |
|
| 664 |
except:
|
|
|
|
| 665 |
bleu_score = -1
|
| 666 |
|
| 667 |
|
|
|
|
| 714 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 715 |
|
| 716 |
except:
|
|
|
|
| 717 |
bleu_score = -1
|
| 718 |
|
| 719 |
|
|
|
|
| 767 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
| 768 |
|
| 769 |
except:
|
|
|
|
| 770 |
bleu_score = -1
|
| 771 |
|
| 772 |
|
|
|
|
| 820 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 821 |
|
| 822 |
except:
|
|
|
|
| 823 |
accuracy = -1
|
| 824 |
|
| 825 |
|
|
|
|
| 874 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 875 |
|
| 876 |
except:
|
|
|
|
| 877 |
accuracy = -1
|
| 878 |
|
| 879 |
|
|
|
|
| 927 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 928 |
|
| 929 |
except:
|
|
|
|
| 930 |
accuracy = -1
|
| 931 |
|
| 932 |
|
|
|
|
| 980 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 981 |
|
| 982 |
except:
|
|
|
|
| 983 |
accuracy = -1
|
| 984 |
|
| 985 |
|
|
|
|
| 1035 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1036 |
|
| 1037 |
except:
|
|
|
|
| 1038 |
accuracy = -1
|
| 1039 |
|
| 1040 |
|
|
|
|
| 1092 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1093 |
|
| 1094 |
except:
|
|
|
|
| 1095 |
accuracy = -1
|
| 1096 |
|
| 1097 |
|
|
|
|
| 1146 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1147 |
|
| 1148 |
except:
|
|
|
|
| 1149 |
accuracy = -1
|
| 1150 |
|
| 1151 |
|
|
|
|
| 1200 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1201 |
|
| 1202 |
except:
|
|
|
|
| 1203 |
accuracy = -1
|
| 1204 |
|
| 1205 |
|
|
|
|
| 1255 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1256 |
|
| 1257 |
except:
|
|
|
|
| 1258 |
accuracy = -1
|
| 1259 |
|
| 1260 |
|
|
|
|
| 1309 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1310 |
|
| 1311 |
except:
|
|
|
|
| 1312 |
accuracy = -1
|
| 1313 |
|
| 1314 |
|
|
|
|
| 1363 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1364 |
|
| 1365 |
except:
|
|
|
|
| 1366 |
accuracy = -1
|
| 1367 |
|
| 1368 |
|
|
|
|
| 1419 |
rougeL = median([results['rougeL'] for results in results_list])
|
| 1420 |
|
| 1421 |
except:
|
|
|
|
| 1422 |
rouge1 = -1
|
| 1423 |
rouge2 = -1
|
| 1424 |
rougeL = -1
|
|
|
|
| 1478 |
rougeL = median([results['rougeL'] for results in results_list])
|
| 1479 |
|
| 1480 |
except:
|
|
|
|
| 1481 |
rouge1 = -1
|
| 1482 |
rouge2 = -1
|
| 1483 |
rougeL = -1
|
|
|
|
| 1537 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1538 |
|
| 1539 |
except:
|
|
|
|
| 1540 |
accuracy = -1
|
| 1541 |
|
| 1542 |
|
|
|
|
| 1592 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1593 |
|
| 1594 |
except:
|
|
|
|
| 1595 |
accuracy = -1
|
| 1596 |
|
| 1597 |
|
|
|
|
| 1648 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1649 |
|
| 1650 |
except:
|
|
|
|
| 1651 |
accuracy = -1
|
| 1652 |
|
| 1653 |
|
|
|
|
| 1704 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1705 |
|
| 1706 |
except:
|
|
|
|
| 1707 |
accuracy = -1
|
| 1708 |
|
| 1709 |
|
|
|
|
| 1760 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1761 |
|
| 1762 |
except:
|
|
|
|
| 1763 |
accuracy = -1
|
| 1764 |
|
| 1765 |
|
|
|
|
| 1816 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1817 |
|
| 1818 |
except:
|
|
|
|
| 1819 |
accuracy = -1
|
| 1820 |
|
| 1821 |
|
|
|
|
| 1872 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1873 |
|
| 1874 |
except:
|
|
|
|
| 1875 |
accuracy = -1
|
| 1876 |
|
| 1877 |
|
|
|
|
| 1929 |
accuracy = median([results['accuracy'] for results in results_list])
|
| 1930 |
|
| 1931 |
except:
|
|
|
|
| 1932 |
accuracy = -1
|
| 1933 |
|
| 1934 |
|
|
|
|
| 3016 |
|
| 3017 |
|
| 3018 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3019 |
gr.Markdown(r"""
|
| 3020 |
|
| 3021 |
If this work is useful to you, please citing our work:
|