Muennighoff commited on
Commit
c22587e
·
1 Parent(s): b9a33e8
Files changed (42) hide show
  1. perplexity/evaluation/generation/merged.csv +53 -0
  2. perplexity/evaluation/generation/merged.json +1 -0
  3. perplexity/evaluation/rankeval/perplexity_0.csv +21 -0
  4. perplexity/evaluation/rankeval/perplexity_0_lm-eval_global_step80108_2023-05-13-09-53-07_0shots_backup.json +0 -87
  5. perplexity/evaluation/rankeval/perplexity_1.csv +21 -0
  6. perplexity/evaluation/rankeval/perplexity_1_lm-eval_global_step80108_2023-05-13-09-53-07_1shots_backup.json +0 -87
  7. perplexity/evaluation/rankeval/perplexity_2.csv +21 -0
  8. perplexity/evaluation/rankeval/perplexity_2_lm-eval_global_step80108_2023-05-13-09-53-07_2shots_backup.json +0 -87
  9. perplexity/evaluation/rankeval/perplexity_3.csv +21 -0
  10. perplexity/evaluation/rankeval/perplexity_3_lm-eval_global_step80108_2023-05-13-09-53-07_3shots_backup.json +0 -87
  11. perplexity/evaluation/rankeval/perplexity_4.csv +21 -0
  12. perplexity/evaluation/rankeval/perplexity_4_lm-eval_global_step80108_2023-05-13-09-53-07_4shots_backup.json +0 -87
  13. perplexity/evaluation/rankeval/perplexity_5.csv +21 -0
  14. perplexity/evaluation/rankeval/perplexity_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json +0 -87
  15. perplexity25/evaluation/generation/merged.csv +53 -0
  16. perplexity25/evaluation/generation/merged.json +1 -0
  17. perplexity25/evaluation/rankeval/perplexity25_0.csv +21 -0
  18. perplexity25/evaluation/rankeval/perplexity25_0_lm-eval_global_step80108_2023-05-13-09-53-07_0shots_backup.json +0 -87
  19. perplexity25/evaluation/rankeval/perplexity25_1.csv +21 -0
  20. perplexity25/evaluation/rankeval/perplexity25_1_lm-eval_global_step80108_2023-05-13-09-53-07_1shots_backup.json +0 -87
  21. perplexity25/evaluation/rankeval/perplexity25_2.csv +21 -0
  22. perplexity25/evaluation/rankeval/perplexity25_2_lm-eval_global_step80108_2023-05-13-09-53-07_2shots_backup.json +0 -87
  23. perplexity25/evaluation/rankeval/perplexity25_3.csv +21 -0
  24. perplexity25/evaluation/rankeval/perplexity25_3_lm-eval_global_step80108_2023-05-13-09-53-07_3shots_backup.json +0 -87
  25. perplexity25/evaluation/rankeval/perplexity25_4.csv +21 -0
  26. perplexity25/evaluation/rankeval/perplexity25_4_lm-eval_global_step80108_2023-05-13-09-53-07_4shots_backup.json +0 -87
  27. perplexity25/evaluation/rankeval/perplexity25_5.csv +21 -0
  28. perplexity25/evaluation/rankeval/perplexity25_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json +0 -87
  29. perplexity50/evaluation/generation/merged.csv +53 -0
  30. perplexity50/evaluation/generation/merged.json +1 -0
  31. perplexity50/evaluation/rankeval/perplexity50_0.csv +21 -0
  32. perplexity50/evaluation/rankeval/perplexity50_0_lm-eval_global_step80108_2023-05-13-09-53-07_0shots_backup.json +0 -87
  33. perplexity50/evaluation/rankeval/perplexity50_1.csv +21 -0
  34. perplexity50/evaluation/rankeval/perplexity50_1_lm-eval_global_step80108_2023-05-13-09-53-07_1shots_backup.json +0 -87
  35. perplexity50/evaluation/rankeval/perplexity50_2.csv +21 -0
  36. perplexity50/evaluation/rankeval/perplexity50_2_lm-eval_global_step80108_2023-05-13-09-53-07_2shots_backup.json +0 -87
  37. perplexity50/evaluation/rankeval/perplexity50_3.csv +21 -0
  38. perplexity50/evaluation/rankeval/perplexity50_3_lm-eval_global_step80108_2023-05-13-09-53-07_3shots_backup.json +0 -87
  39. perplexity50/evaluation/rankeval/perplexity50_4.csv +21 -0
  40. perplexity50/evaluation/rankeval/perplexity50_4_lm-eval_global_step80108_2023-05-13-09-53-07_4shots_backup.json +0 -87
  41. perplexity50/evaluation/rankeval/perplexity50_5.csv +21 -0
  42. perplexity50/evaluation/rankeval/perplexity50_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json +0 -87
perplexity/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.022292985524574444
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.022292985524574444
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1995055031643681
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1995055031643681
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2403073697075928
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2403073697075928
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24067531660097596
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.24067531660097596
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2283664329607444
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2283664329607444
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.22088952705630202
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.22088952705630202
14
+ e2e_nlg_cleaned,5,average,multiple,0.19200618916909296
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05432411992031246
16
+ gem_xsum,0,median,rouge2_fmeasure,0.05432411992031246
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.0333977078622104
18
+ gem_xsum,1,median,rouge2_fmeasure,0.0333977078622104
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04006840275451415
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04006840275451415
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03850524479452543
22
+ gem_xsum,3,median,rouge2_fmeasure,0.03850524479452543
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010989532389756859
24
+ gem_xsum,4,median,rouge2_fmeasure,0.010989532389756859
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005154358964062439
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0005154358964062439
27
+ gem_xsum,5,average,multiple,0.029633407269620923
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.046598784604859486
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.046598784604859486
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.050065381446004406
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.050065381446004406
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05299300887318277
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.05299300887318277
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05346240993388053
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05346240993388053
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05523835531458514
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.05523835531458514
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05619659179665226
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.05619659179665226
40
+ web_nlg_en,5,average,multiple,0.0524257553281941
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03434823579340285
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03434823579340285
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05439999185109123
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05439999185109123
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.055330208196910265
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.055330208196910265
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.046193370406017856
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.046193370406017856
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013946881394381343
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.013946881394381343
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002093015608191419
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.002093015608191419
53
+ wiki_lingua_en,5,average,multiple,0.03438528387499916
perplexity/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.30438087878461806, "bleu_stderr": 0.027985398209556842, "rouge1_fmeasure": 0.10072645925520833, "rouge1_fmeasure_stderr": 0.001803334386023207, "rouge1_precision": 0.06571872727613429, "rouge1_precision_stderr": 0.0014178747302171335, "rouge1_recall": 0.28999616131453776, "rouge1_recall_stderr": 0.00434042822819289, "rouge2_fmeasure": 0.046598784604859486, "rouge2_fmeasure_stderr": 0.0011368379760255077, "rouge2_precision": 0.030429296094824455, "rouge2_precision_stderr": 0.0008978277172003451, "rouge2_recall": 0.13772743526874254, "rouge2_recall_stderr": 0.0029713701289473594, "rougeL_fmeasure": 0.09804723740892954, "rougeL_fmeasure_stderr": 0.001738122594593388, "rougeL_precision": 0.06388836453568124, "rougeL_precision_stderr": 0.0013632881297082904, "rougeL_recall": 0.28363499948165843, "rougeL_recall_stderr": 0.004278943131293478, "rougeLsum_fmeasure": 0.09688278752987489, "rougeLsum_fmeasure_stderr": 0.0017265830325622752, "rougeLsum_precision": 0.06323531180120187, "rougeLsum_precision_stderr": 0.0013639142517225444, "rougeLsum_recall": 0.2788262564249966, "rougeLsum_recall_stderr": 0.004155727073631924}}, "1": {"PALM_prompt": {"bleu": 0.3677620101657523, "bleu_stderr": 0.02209751845324868, "rouge1_fmeasure": 0.10743506199331603, "rouge1_fmeasure_stderr": 0.0017479884253783432, "rouge1_precision": 0.06882276794192525, "rouge1_precision_stderr": 0.001269726775890253, "rouge1_recall": 0.3358838633226288, "rouge1_recall_stderr": 0.004944830474767479, "rouge2_fmeasure": 0.050065381446004406, "rouge2_fmeasure_stderr": 0.0011139596034513608, "rouge2_precision": 0.03198910527175372, "rouge2_precision_stderr": 0.0007904311483238523, "rouge2_recall": 0.1639727133230724, "rouge2_recall_stderr": 0.0035115320297904157, "rougeL_fmeasure": 0.10330309525862182, "rougeL_fmeasure_stderr": 0.0016709978496468228, "rougeL_precision": 0.06619223938605175, "rougeL_precision_stderr": 0.001213536084420925, "rougeL_recall": 0.3209215286944476, "rougeL_recall_stderr": 0.004642641539916205, "rougeLsum_fmeasure": 0.102932245838326, "rougeLsum_fmeasure_stderr": 0.0016665533221243655, "rougeLsum_precision": 0.06596557560336991, "rougeLsum_precision_stderr": 0.0012138292029559524, "rougeLsum_recall": 0.32099921969419465, "rougeLsum_recall_stderr": 0.004628086148730377}}, "2": {"PALM_prompt": {"bleu": 0.47923990514244874, "bleu_stderr": 0.03166485764907936, "rouge1_fmeasure": 0.11370260047327577, "rouge1_fmeasure_stderr": 0.0017046442313663663, "rouge1_precision": 0.07204383319731683, "rouge1_precision_stderr": 0.0012155881365152256, "rouge1_recall": 0.36672271781887156, "rouge1_recall_stderr": 0.00504889217499601, "rouge2_fmeasure": 0.05299300887318277, "rouge2_fmeasure_stderr": 0.0010947490193883183, "rouge2_precision": 0.03340955873075625, "rouge2_precision_stderr": 0.0007555261485064188, "rouge2_recall": 0.18177633862979822, "rouge2_recall_stderr": 0.0037021276711014387, "rougeL_fmeasure": 0.1079044075881634, "rougeL_fmeasure_stderr": 0.0015944656814960549, "rougeL_precision": 0.0684099274474328, "rougeL_precision_stderr": 0.00113914940883586, "rougeL_recall": 0.3455577571113544, "rougeL_recall_stderr": 0.004627024307619622, "rougeLsum_fmeasure": 0.10867383319593066, "rougeLsum_fmeasure_stderr": 0.0016268811944244747, "rougeLsum_precision": 0.06889283297975032, "rougeLsum_precision_stderr": 0.0011627655720725584, "rougeLsum_recall": 0.3493606357170359, "rougeLsum_recall_stderr": 0.0047163037850670355}}, "3": {"PALM_prompt": {"bleu": 0.5027920774081681, "bleu_stderr": 0.030462214372567638, "rouge1_fmeasure": 0.1146798040803432, "rouge1_fmeasure_stderr": 0.0016732637014203531, "rouge1_precision": 0.07255116336657369, "rouge1_precision_stderr": 0.0011977538039791803, "rouge1_recall": 0.38345627514615743, "rouge1_recall_stderr": 0.005203246900271579, "rouge2_fmeasure": 0.05346240993388053, "rouge2_fmeasure_stderr": 0.00108055575928216, "rouge2_precision": 0.03363690204181306, "rouge2_precision_stderr": 0.0007452192351280848, "rouge2_recall": 0.1918451527107662, "rouge2_recall_stderr": 0.0038340869615008937, "rougeL_fmeasure": 0.10795694469281018, "rougeL_fmeasure_stderr": 0.0015660316693302382, "rougeL_precision": 0.06838424060537425, "rougeL_precision_stderr": 0.0011261261420472187, "rougeL_recall": 0.35815409259444997, "rougeL_recall_stderr": 0.004736318677371466, "rougeLsum_fmeasure": 0.10928613566777898, "rougeLsum_fmeasure_stderr": 0.0015925978011445418, "rougeLsum_precision": 0.06920483129615293, "rougeLsum_precision_stderr": 0.0011443226072995456, "rougeLsum_recall": 0.36357801556888564, "rougeLsum_recall_stderr": 0.004808733759405405}}, "4": {"PALM_prompt": {"bleu": 0.5831555359984453, "bleu_stderr": 0.032234659529391536, "rouge1_fmeasure": 0.11750008800672353, "rouge1_fmeasure_stderr": 0.0016477505264978866, "rouge1_precision": 0.07392661038451319, "rouge1_precision_stderr": 0.00117243887617427, "rouge1_recall": 0.39335666383917167, "rouge1_recall_stderr": 0.005142725185391461, "rouge2_fmeasure": 0.05523835531458514, "rouge2_fmeasure_stderr": 0.0010698113777651895, "rouge2_precision": 0.03448655846022897, "rouge2_precision_stderr": 0.000729028189759837, "rouge2_recall": 0.20020642249358464, "rouge2_recall_stderr": 0.0038687044782014456, "rougeL_fmeasure": 0.1101991356020512, "rougeL_fmeasure_stderr": 0.0015236328659278776, "rougeL_precision": 0.06941307599352699, "rougeL_precision_stderr": 0.0010899673683375645, "rougeL_recall": 0.3670211800134988, "rougeL_recall_stderr": 0.004700921629116034, "rougeLsum_fmeasure": 0.1117873702364502, "rougeLsum_fmeasure_stderr": 0.0015580968509773768, "rougeLsum_precision": 0.07037653651757456, "rougeLsum_precision_stderr": 0.0011110620728147437, "rougeLsum_recall": 0.3732246975137694, "rougeLsum_recall_stderr": 0.004797544529790582}}, "5": {"PALM_prompt": {"bleu": 0.6365749188582466, "bleu_stderr": 0.039638813172299045, "rouge1_fmeasure": 0.11886210839837029, "rouge1_fmeasure_stderr": 0.001641331021319136, "rouge1_precision": 0.07471557189793104, "rouge1_precision_stderr": 0.001168900413573336, "rouge1_recall": 0.40490074663891484, "rouge1_recall_stderr": 0.005284549026027036, "rouge2_fmeasure": 0.05619659179665226, "rouge2_fmeasure_stderr": 0.0010637830284484662, "rouge2_precision": 0.0350998165096206, "rouge2_precision_stderr": 0.0007323935879922631, "rouge2_recall": 0.20683097072034598, "rouge2_recall_stderr": 0.003912349890143833, "rougeL_fmeasure": 0.10996565063664662, "rougeL_fmeasure_stderr": 0.0015072851492445695, "rougeL_precision": 0.06926051144904892, "rougeL_precision_stderr": 0.0010863629617073184, "rougeL_recall": 0.37213644980404864, "rougeL_recall_stderr": 0.004688196830703624, "rougeLsum_fmeasure": 0.11299990359140699, "rougeLsum_fmeasure_stderr": 0.001561286385930656, "rougeLsum_precision": 0.07111141425693926, "rougeLsum_precision_stderr": 0.0011190880115330949, "rougeLsum_recall": 0.38316893134013263, "rougeLsum_recall_stderr": 0.004892613100508629}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5579291268345696, "bleu_stderr": 0.06016300187830401, "rouge1_fmeasure": 0.17406988949643304, "rouge1_fmeasure_stderr": 0.0018339930055680368, "rouge1_precision": 0.14888065220459862, "rouge1_precision_stderr": 0.0018522381431221608, "rouge1_recall": 0.2533502341723187, "rouge1_recall_stderr": 0.002694764028887335, "rouge2_fmeasure": 0.03434823579340285, "rouge2_fmeasure_stderr": 0.0008465750128095288, "rouge2_precision": 0.029169878872080838, "rouge2_precision_stderr": 0.0007679053115951254, "rouge2_recall": 0.05196743771903658, "rouge2_recall_stderr": 0.001421253503809284, "rougeL_fmeasure": 0.13456472416719728, "rougeL_fmeasure_stderr": 0.0013210328093623985, "rougeL_precision": 0.1138889494347587, "rougeL_precision_stderr": 0.0013014428116691651, "rougeL_recall": 0.20025749658243655, "rougeL_recall_stderr": 0.0021830666467128335, "rougeLsum_fmeasure": 0.16115176655656072, "rougeLsum_fmeasure_stderr": 0.0016863623652902142, "rougeLsum_precision": 0.13762861506852897, "rougeLsum_precision_stderr": 0.001700786002748469, "rougeLsum_recall": 0.23544002194089206, "rougeLsum_recall_stderr": 0.002515653712672603}}, "1": {"tldr_en": {"bleu": 2.8504691019972226, "bleu_stderr": 0.07529997127562747, "rouge1_fmeasure": 0.21615787975223222, "rouge1_fmeasure_stderr": 0.001980052888687319, "rouge1_precision": 0.18628138897984436, "rouge1_precision_stderr": 0.0021004152459147747, "rouge1_recall": 0.3129007187114192, "rouge1_recall_stderr": 0.0029032182486249418, "rouge2_fmeasure": 0.05439999185109123, "rouge2_fmeasure_stderr": 0.001031487571403725, "rouge2_precision": 0.046486537780770956, "rouge2_precision_stderr": 0.0009461054630336336, "rouge2_recall": 0.08171808437787392, "rouge2_recall_stderr": 0.001737593596442716, "rougeL_fmeasure": 0.15690618601267947, "rougeL_fmeasure_stderr": 0.0013436606219927435, "rougeL_precision": 0.13401183830304364, "rougeL_precision_stderr": 0.00140969758282127, "rougeL_recall": 0.23246416864486025, "rougeL_recall_stderr": 0.0022971477521400944, "rougeLsum_fmeasure": 0.20269374476846683, "rougeLsum_fmeasure_stderr": 0.0018536229956474099, "rougeLsum_precision": 0.1743604474408626, "rougeLsum_precision_stderr": 0.001958253968449097, "rougeLsum_recall": 0.2946260029722949, "rougeLsum_recall_stderr": 0.0027768807102536105}}, "2": {"tldr_en": {"bleu": 3.127885213179027, "bleu_stderr": 0.05567768230878822, "rouge1_fmeasure": 0.21248721018427344, "rouge1_fmeasure_stderr": 0.002005377613049306, "rouge1_precision": 0.19145535496087807, "rouge1_precision_stderr": 0.002296451033052049, "rouge1_recall": 0.2975680142091921, "rouge1_recall_stderr": 0.002931688880420891, "rouge2_fmeasure": 0.055330208196910265, "rouge2_fmeasure_stderr": 0.0010696008344144292, "rouge2_precision": 0.05003016577994891, "rouge2_precision_stderr": 0.0011239070261388475, "rouge2_recall": 0.08044311392175647, "rouge2_recall_stderr": 0.001763848842609043, "rougeL_fmeasure": 0.1613056472114288, "rougeL_fmeasure_stderr": 0.0014427201039189429, "rougeL_precision": 0.14417467914561882, "rougeL_precision_stderr": 0.0016764997188872376, "rougeL_recall": 0.23103867510190504, "rougeL_recall_stderr": 0.002419663302485, "rougeLsum_fmeasure": 0.19917102006805654, "rougeLsum_fmeasure_stderr": 0.0018806834912790124, "rougeLsum_precision": 0.17923916059988207, "rougeLsum_precision_stderr": 0.002160531408137847, "rougeLsum_recall": 0.27981692016794185, "rougeLsum_recall_stderr": 0.002789997688790343}}, "3": {"tldr_en": {"bleu": 3.0807771429420754, "bleu_stderr": 0.09283762605369977, "rouge1_fmeasure": 0.1775686933392586, "rouge1_fmeasure_stderr": 0.0023132433314061317, "rouge1_precision": 0.17616795901868026, "rouge1_precision_stderr": 0.0029041735231821264, "rouge1_recall": 0.2398365427132762, "rouge1_recall_stderr": 0.003303934643917804, "rouge2_fmeasure": 0.046193370406017856, "rouge2_fmeasure_stderr": 0.0010778055491266162, "rouge2_precision": 0.046939802020617566, "rouge2_precision_stderr": 0.0014687470735497206, "rouge2_recall": 0.06422548105107985, "rouge2_recall_stderr": 0.0016475717099470786, "rougeL_fmeasure": 0.135715301519862, "rougeL_fmeasure_stderr": 0.0017310055665623648, "rougeL_precision": 0.13481464964787096, "rougeL_precision_stderr": 0.0022897360632217266, "rougeL_recall": 0.18678109287740904, "rougeL_recall_stderr": 0.0026738452831759677, "rougeLsum_fmeasure": 0.1672145914904344, "rougeLsum_fmeasure_stderr": 0.0021753093573102073, "rougeLsum_precision": 0.1658688389623912, "rougeLsum_precision_stderr": 0.002743944725824659, "rougeLsum_recall": 0.22654605759305751, "rougeLsum_recall_stderr": 0.0031428598277916196}}, "4": {"tldr_en": {"bleu": 0.5293847081914334, "bleu_stderr": 0.044698835511384254, "rouge1_fmeasure": 0.05573280124002876, "rouge1_fmeasure_stderr": 0.0019136745631413645, "rouge1_precision": 0.0584713690765183, "rouge1_precision_stderr": 0.0023318063534537165, "rouge1_recall": 0.07718352458545524, "rouge1_recall_stderr": 0.002735992768065453, "rouge2_fmeasure": 0.013946881394381343, "rouge2_fmeasure_stderr": 0.0007038540032034839, "rouge2_precision": 0.01500160633710272, "rouge2_precision_stderr": 0.0009825454465574395, "rouge2_recall": 0.02039516990198183, "rouge2_recall_stderr": 0.001139121615096021, "rougeL_fmeasure": 0.04301362739459154, "rougeL_fmeasure_stderr": 0.0014516622611586488, "rougeL_precision": 0.04558030719768415, "rougeL_precision_stderr": 0.0018674171866451237, "rougeL_recall": 0.06093944803992283, "rougeL_recall_stderr": 0.0022091397429612157, "rougeLsum_fmeasure": 0.052282734877528686, "rougeLsum_fmeasure_stderr": 0.0017926957831151907, "rougeLsum_precision": 0.05496664162339461, "rougeLsum_precision_stderr": 0.0022050347910565274, "rougeLsum_recall": 0.0725975299250868, "rougeLsum_recall_stderr": 0.0025818847671325463}}, "5": {"tldr_en": {"bleu": 1.9499255251335006e-07, "bleu_stderr": 3.3145291785030997e-07, "rouge1_fmeasure": 0.008623575125031323, "rouge1_fmeasure_stderr": 0.0008288612961403041, "rouge1_precision": 0.009305181256700445, "rouge1_precision_stderr": 0.0010395448742730087, "rouge1_recall": 0.012223369402839458, "rouge1_recall_stderr": 0.0012274307651446278, "rouge2_fmeasure": 0.002093015608191419, "rouge2_fmeasure_stderr": 0.00027758841441824164, "rouge2_precision": 0.0025435206455146683, "rouge2_precision_stderr": 0.0005383342373842493, "rouge2_recall": 0.0031006973107968873, "rouge2_recall_stderr": 0.0004472222802474402, "rougeL_fmeasure": 0.006710465163509879, "rougeL_fmeasure_stderr": 0.0006498829914466451, "rougeL_precision": 0.007394254420247825, "rougeL_precision_stderr": 0.000882581070383217, "rougeL_recall": 0.009645269299470693, "rougeL_recall_stderr": 0.0009987275588991668, "rougeLsum_fmeasure": 0.008136271016090088, "rougeLsum_fmeasure_stderr": 0.0007837539033952645, "rougeLsum_precision": 0.008824151605454963, "rougeLsum_precision_stderr": 0.0010021873301601862, "rougeLsum_recall": 0.011594578158050393, "rougeLsum_recall_stderr": 0.0011725427133673896}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.82061032660784, "bleu_stderr": 0.07096469051250175, "rouge1_fmeasure": 0.15825083088882455, "rouge1_fmeasure_stderr": 0.0015057952646897993, "rouge1_precision": 0.222486766832112, "rouge1_precision_stderr": 0.0033842158110189277, "rouge1_recall": 0.1790689833398624, "rouge1_recall_stderr": 0.0018970549220355723, "rouge2_fmeasure": 0.022292985524574444, "rouge2_fmeasure_stderr": 0.0006610427910211585, "rouge2_precision": 0.021935233358876982, "rouge2_precision_stderr": 0.0008022922500453108, "rouge2_recall": 0.031732352695645706, "rouge2_recall_stderr": 0.0009741385489925919, "rougeL_fmeasure": 0.1495036471572004, "rougeL_fmeasure_stderr": 0.0013950962210181237, "rougeL_precision": 0.20561351860512828, "rougeL_precision_stderr": 0.0030700557880713867, "rougeL_recall": 0.17239931551868604, "rougeL_recall_stderr": 0.001867012935877257, "rougeLsum_fmeasure": 0.136862319950687, "rougeLsum_fmeasure_stderr": 0.0014225741473040676, "rougeLsum_precision": 0.19805184360305841, "rougeLsum_precision_stderr": 0.0032227599715627343, "rougeLsum_recall": 0.1509451752386678, "rougeLsum_recall_stderr": 0.0016062725531267121}}, "1": {"generate_text_restaurant": {"bleu": 11.657037239702142, "bleu_stderr": 0.1250980356694754, "rouge1_fmeasure": 0.4389348120897716, "rouge1_fmeasure_stderr": 0.002063579737266741, "rouge1_precision": 0.45968566711122216, "rouge1_precision_stderr": 0.002881285091661702, "rouge1_recall": 0.46542006430779354, "rouge1_recall_stderr": 0.0028779976054145785, "rouge2_fmeasure": 0.1995055031643681, "rouge2_fmeasure_stderr": 0.0017838055449605266, "rouge2_precision": 0.21038656753149504, "rouge2_precision_stderr": 0.002167887993557998, "rouge2_recall": 0.21239637096024527, "rouge2_recall_stderr": 0.0021227522877282565, "rougeL_fmeasure": 0.3114176922977353, "rougeL_fmeasure_stderr": 0.0017952929940337735, "rougeL_precision": 0.3268614579954877, "rougeL_precision_stderr": 0.002404585448478252, "rougeL_recall": 0.3303654420846652, "rougeL_recall_stderr": 0.002362681762163533, "rougeLsum_fmeasure": 0.3620752417019191, "rougeLsum_fmeasure_stderr": 0.002056079671135789, "rougeLsum_precision": 0.3793916638196591, "rougeLsum_precision_stderr": 0.002677199223367296, "rougeLsum_recall": 0.3835618439741999, "rougeLsum_recall_stderr": 0.002671098460743732}}, "2": {"generate_text_restaurant": {"bleu": 14.339702198028428, "bleu_stderr": 0.13350232957464767, "rouge1_fmeasure": 0.4798241859576156, "rouge1_fmeasure_stderr": 0.0022710941605897325, "rouge1_precision": 0.5570264884781344, "rouge1_precision_stderr": 0.0033490944418358374, "rouge1_recall": 0.46432809954638865, "rouge1_recall_stderr": 0.002945932627099228, "rouge2_fmeasure": 0.2403073697075928, "rouge2_fmeasure_stderr": 0.002087685451857004, "rouge2_precision": 0.28313861767317927, "rouge2_precision_stderr": 0.0027952153389596843, "rouge2_recall": 0.2325744710533778, "rouge2_recall_stderr": 0.0022998468012822644, "rougeL_fmeasure": 0.35446236973706474, "rougeL_fmeasure_stderr": 0.0021166068311389095, "rougeL_precision": 0.4135082878473266, "rougeL_precision_stderr": 0.003050078856711996, "rougeL_recall": 0.34252272942973466, "rougeL_recall_stderr": 0.002524272588321217, "rougeLsum_fmeasure": 0.40022368564912997, "rougeLsum_fmeasure_stderr": 0.00229931320451958, "rougeLsum_precision": 0.46552847624479543, "rougeLsum_precision_stderr": 0.0032616657748971998, "rougeLsum_recall": 0.3869495647860439, "rougeLsum_recall_stderr": 0.002764146985697145}}, "3": {"generate_text_restaurant": {"bleu": 14.50537794842067, "bleu_stderr": 0.11584363484363577, "rouge1_fmeasure": 0.4791848298362283, "rouge1_fmeasure_stderr": 0.0022007720607771426, "rouge1_precision": 0.5402112307108176, "rouge1_precision_stderr": 0.003295524595806651, "rouge1_recall": 0.47732917178018114, "rouge1_recall_stderr": 0.00294022910982499, "rouge2_fmeasure": 0.24067531660097596, "rouge2_fmeasure_stderr": 0.002087534005568858, "rouge2_precision": 0.2750068424415456, "rouge2_precision_stderr": 0.002742749101788008, "rouge2_recall": 0.239901278700609, "rouge2_recall_stderr": 0.0023557789869618896, "rougeL_fmeasure": 0.35365228129689674, "rougeL_fmeasure_stderr": 0.00209824048462177, "rougeL_precision": 0.40045561756040954, "rougeL_precision_stderr": 0.002992395353090302, "rougeL_recall": 0.35170929339590973, "rougeL_recall_stderr": 0.0025447548206976338, "rougeLsum_fmeasure": 0.40103023450016195, "rougeLsum_fmeasure_stderr": 0.002275399298911374, "rougeLsum_precision": 0.45223409928062946, "rougeLsum_precision_stderr": 0.0031770144742772127, "rougeLsum_recall": 0.3996467826064061, "rougeLsum_recall_stderr": 0.002820396617171887}}, "4": {"generate_text_restaurant": {"bleu": 13.323972252459379, "bleu_stderr": 0.16742143852884073, "rouge1_fmeasure": 0.46396091436304504, "rouge1_fmeasure_stderr": 0.0021581635430039958, "rouge1_precision": 0.4899803542811925, "rouge1_precision_stderr": 0.003129903470372565, "rouge1_recall": 0.48706786873461216, "rouge1_recall_stderr": 0.002870795532573222, "rouge2_fmeasure": 0.2283664329607444, "rouge2_fmeasure_stderr": 0.0020126317997385106, "rouge2_precision": 0.24359351406261168, "rouge2_precision_stderr": 0.0025312411079664366, "rouge2_recall": 0.24067281188228995, "rouge2_recall_stderr": 0.0023396690126969532, "rougeL_fmeasure": 0.3421085000388542, "rougeL_fmeasure_stderr": 0.002013555794275305, "rougeL_precision": 0.3624069982870762, "rougeL_precision_stderr": 0.0027711033582985124, "rougeL_recall": 0.3591682726211815, "rougeL_recall_stderr": 0.0025140239206322313, "rougeLsum_fmeasure": 0.3900177854874722, "rougeLsum_fmeasure_stderr": 0.0022269574328580566, "rougeLsum_precision": 0.4114748595185199, "rougeLsum_precision_stderr": 0.0029677058156554, "rougeLsum_recall": 0.4100795333879036, "rougeLsum_recall_stderr": 0.002820033316291732}}, "5": {"generate_text_restaurant": {"bleu": 12.462777642040733, "bleu_stderr": 0.13551486783004044, "rouge1_fmeasure": 0.4559772721525502, "rouge1_fmeasure_stderr": 0.0020283797559365376, "rouge1_precision": 0.45865129639331476, "rouge1_precision_stderr": 0.002686570307588201, "rouge1_recall": 0.4938245489163575, "rouge1_recall_stderr": 0.0027958101448835645, "rouge2_fmeasure": 0.22088952705630202, "rouge2_fmeasure_stderr": 0.0018905153937119456, "rouge2_precision": 0.22301236793840634, "rouge2_precision_stderr": 0.002160872250109803, "rouge2_recall": 0.24081053155333462, "rouge2_recall_stderr": 0.0022973862129338496, "rougeL_fmeasure": 0.3335077717084848, "rougeL_fmeasure_stderr": 0.0018782606369312164, "rougeL_precision": 0.3354962184605829, "rougeL_precision_stderr": 0.0023253516829045725, "rougeL_recall": 0.3619962097662624, "rougeL_recall_stderr": 0.002476618191262458, "rougeLsum_fmeasure": 0.38415093197988986, "rougeLsum_fmeasure_stderr": 0.00209136460819391, "rougeLsum_precision": 0.3861502626280554, "rougeLsum_precision_stderr": 0.0025748401786830784, "rougeLsum_recall": 0.416513089237172, "rougeLsum_recall_stderr": 0.002733652456992395}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.282427388603868, "bleu_stderr": 0.12070696540892288, "rouge1_fmeasure": 0.21547497313031236, "rouge1_fmeasure_stderr": 0.002589604938536055, "rouge1_precision": 0.15955334897495682, "rouge1_precision_stderr": 0.002209067557750088, "rouge1_recall": 0.36217161593458225, "rouge1_recall_stderr": 0.004440471846187096, "rouge2_fmeasure": 0.05432411992031246, "rouge2_fmeasure_stderr": 0.0017179496173428985, "rouge2_precision": 0.039876602220101885, "rouge2_precision_stderr": 0.0013408657891869856, "rouge2_recall": 0.09373979355812566, "rouge2_recall_stderr": 0.003020539395998483, "rougeL_fmeasure": 0.16451405093380472, "rougeL_fmeasure_stderr": 0.0019625362254798284, "rougeL_precision": 0.12161602298528985, "rougeL_precision_stderr": 0.0016687848022286617, "rougeL_recall": 0.277756692596081, "rougeL_recall_stderr": 0.003476817201900778, "rougeLsum_fmeasure": 0.1678765700309988, "rougeLsum_fmeasure_stderr": 0.002214619502903411, "rougeLsum_precision": 0.12390275070243846, "rougeLsum_precision_stderr": 0.001821194882097903, "rougeLsum_recall": 0.2841653396002695, "rougeLsum_recall_stderr": 0.003918981158721574}}, "1": {"article_DOC_summary": {"bleu": 1.3775738726107656, "bleu_stderr": 0.08168674122275797, "rouge1_fmeasure": 0.16895420412623663, "rouge1_fmeasure_stderr": 0.002594837744965486, "rouge1_precision": 0.12010749072738239, "rouge1_precision_stderr": 0.0019038452514628232, "rouge1_recall": 0.2963599272578747, "rouge1_recall_stderr": 0.004492167665019704, "rouge2_fmeasure": 0.0333977078622104, "rouge2_fmeasure_stderr": 0.00144862051823388, "rouge2_precision": 0.02342764378945351, "rouge2_precision_stderr": 0.001019840391470285, "rouge2_recall": 0.0606775250012725, "rouge2_recall_stderr": 0.002681867584351387, "rougeL_fmeasure": 0.1329671503634145, "rougeL_fmeasure_stderr": 0.0019756316848368602, "rougeL_precision": 0.09436779371850286, "rougeL_precision_stderr": 0.0014414694335729383, "rougeL_recall": 0.2345277453603614, "rougeL_recall_stderr": 0.003510304651172983, "rougeLsum_fmeasure": 0.1348677191851774, "rougeLsum_fmeasure_stderr": 0.0021296361098744635, "rougeLsum_precision": 0.09564640396609536, "rougeLsum_precision_stderr": 0.0015463429453530608, "rougeLsum_recall": 0.23817260800506607, "rougeLsum_recall_stderr": 0.0037920083635799903}}, "2": {"article_DOC_summary": {"bleu": 1.6436795917006544, "bleu_stderr": 0.07345990657620402, "rouge1_fmeasure": 0.183152447945667, "rouge1_fmeasure_stderr": 0.0026661841419304156, "rouge1_precision": 0.13018508734953355, "rouge1_precision_stderr": 0.0019680938941104625, "rouge1_recall": 0.3210711017407105, "rouge1_recall_stderr": 0.004611340413923777, "rouge2_fmeasure": 0.04006840275451415, "rouge2_fmeasure_stderr": 0.0015228269131846886, "rouge2_precision": 0.028176744941516615, "rouge2_precision_stderr": 0.001075491149082574, "rouge2_recall": 0.07240802386862864, "rouge2_recall_stderr": 0.002820788158486654, "rougeL_fmeasure": 0.14371195073214757, "rougeL_fmeasure_stderr": 0.0019879171616702124, "rougeL_precision": 0.10199545313071355, "rougeL_precision_stderr": 0.0014580281840802887, "rougeL_recall": 0.2531633381335362, "rougeL_recall_stderr": 0.003556665780315997, "rougeLsum_fmeasure": 0.14532030438832994, "rougeLsum_fmeasure_stderr": 0.002206439734918519, "rougeLsum_precision": 0.10303479198876377, "rougeLsum_precision_stderr": 0.0016024271297654439, "rougeLsum_recall": 0.2565576177699366, "rougeLsum_recall_stderr": 0.003974104626273052}}, "3": {"article_DOC_summary": {"bleu": 1.6496742062970557, "bleu_stderr": 0.10044687108522192, "rouge1_fmeasure": 0.17881939665307162, "rouge1_fmeasure_stderr": 0.0027596350151271876, "rouge1_precision": 0.13017675194898692, "rouge1_precision_stderr": 0.002198538499348046, "rouge1_recall": 0.30922236829508537, "rouge1_recall_stderr": 0.004838699494510795, "rouge2_fmeasure": 0.03850524479452543, "rouge2_fmeasure_stderr": 0.0015068733635772717, "rouge2_precision": 0.02760225846524049, "rouge2_precision_stderr": 0.0011161638977048621, "rouge2_recall": 0.06881042595527115, "rouge2_recall_stderr": 0.0028092871690327376, "rougeL_fmeasure": 0.13794374325281775, "rougeL_fmeasure_stderr": 0.0020813614818383725, "rougeL_precision": 0.10014486228665212, "rougeL_precision_stderr": 0.0016329144933538103, "rougeL_recall": 0.2399877011308486, "rougeL_recall_stderr": 0.0037910983726926846, "rougeLsum_fmeasure": 0.1407907014518307, "rougeLsum_fmeasure_stderr": 0.0022509491687602674, "rougeLsum_precision": 0.10223719953083442, "rougeLsum_precision_stderr": 0.001759902279193926, "rougeLsum_recall": 0.24547016572644775, "rougeLsum_recall_stderr": 0.004083522562686188}}, "4": {"article_DOC_summary": {"bleu": 0.8321034229031437, "bleu_stderr": 0.08945851965278599, "rouge1_fmeasure": 0.049656632315842844, "rouge1_fmeasure_stderr": 0.002856336743990567, "rouge1_precision": 0.04149135621677754, "rouge1_precision_stderr": 0.0026386378608042917, "rouge1_recall": 0.07750948792703138, "rouge1_recall_stderr": 0.004477188850582034, "rouge2_fmeasure": 0.010989532389756859, "rouge2_fmeasure_stderr": 0.001029101618129444, "rouge2_precision": 0.008961527810790616, "rouge2_precision_stderr": 0.0009890519095079862, "rouge2_recall": 0.017840955869101484, "rouge2_recall_stderr": 0.0016844183124156535, "rougeL_fmeasure": 0.0375802335882974, "rougeL_fmeasure_stderr": 0.002147921108027172, "rougeL_precision": 0.03154250432068281, "rougeL_precision_stderr": 0.0020252442164337293, "rougeL_recall": 0.05909979811818957, "rougeL_recall_stderr": 0.0034347562251169, "rougeLsum_fmeasure": 0.03944906528704634, "rougeLsum_fmeasure_stderr": 0.0022725008097993537, "rougeLsum_precision": 0.03303780727965199, "rougeLsum_precision_stderr": 0.002118654737857532, "rougeLsum_recall": 0.06214884490779202, "rougeLsum_recall_stderr": 0.003663125602489163}}, "5": {"article_DOC_summary": {"bleu": 3.1493567208404783e-39, "bleu_stderr": 1.4249121699400145e-34, "rouge1_fmeasure": 0.0028022236632419895, "rouge1_fmeasure_stderr": 0.0007863889401033125, "rouge1_precision": 0.003117323860723007, "rouge1_precision_stderr": 0.0008837148810752774, "rouge1_recall": 0.0026242515972514074, "rouge1_recall_stderr": 0.0007336964722264186, "rouge2_fmeasure": 0.0005154358964062439, "rouge2_fmeasure_stderr": 0.00023513093079516412, "rouge2_precision": 0.0005615954214733349, "rouge2_precision_stderr": 0.00024697557291310967, "rouge2_recall": 0.0004889607248097814, "rouge2_recall_stderr": 0.00023161252427707105, "rougeL_fmeasure": 0.002054859718325845, "rougeL_fmeasure_stderr": 0.0005764790804455003, "rougeL_precision": 0.002278284775437062, "rougeL_precision_stderr": 0.0006480855590254838, "rougeL_recall": 0.0019397313262420287, "rougeL_recall_stderr": 0.0005430737577124271, "rougeLsum_fmeasure": 0.002218076936186532, "rougeLsum_fmeasure_stderr": 0.0006333184346671743, "rougeLsum_precision": 0.0024716725936861457, "rougeLsum_precision_stderr": 0.0007191187737619554, "rougeLsum_recall": 0.0020845273284634726, "rougeLsum_recall_stderr": 0.0005920007456566033}}}}
perplexity/evaluation/rankeval/perplexity_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.335,0.014933117490932573,0
3
+ anli_r2,acc,0.332,0.014899597242811476,0
4
+ anli_r3,acc,0.33666666666666667,0.01364760294240639,0
5
+ arc_challenge,acc,0.28498293515358364,0.013191348179838793,0
6
+ arc_challenge,acc_norm,0.3037542662116041,0.013438909184778757,0
7
+ arc_easy,acc,0.6031144781144782,0.010039236800583209,0
8
+ arc_easy,acc_norm,0.5332491582491582,0.010237073872130745,0
9
+ boolq,acc,0.5923547400611621,0.008594580270731613,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.1940928270042194,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4989046006771559,0.004989769436956927,0
14
+ hellaswag,acc_norm,0.6554471220872337,0.004742510354777903,0
15
+ piqa,acc,0.7616974972796517,0.009940334245876207,0
16
+ piqa,acc_norm,0.7693144722524483,0.009828959550983103,0
17
+ rte,acc,0.5342960288808665,0.030025579819366426,0
18
+ sciq,acc,0.862,0.010912152632504411,0
19
+ sciq,acc_norm,0.782,0.013063179040595296,0
20
+ storycloze_2016,acc,0.7349011223944415,0.01020698782076139,0
21
+ winogrande,acc,0.5895816890292028,0.013825107120035865,0
perplexity/evaluation/rankeval/perplexity_0_lm-eval_global_step80108_2023-05-13-09-53-07_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.335,
5
- "acc_stderr": 0.014933117490932573
6
- },
7
- "anli_r2": {
8
- "acc": 0.332,
9
- "acc_stderr": 0.014899597242811476
10
- },
11
- "anli_r3": {
12
- "acc": 0.33666666666666667,
13
- "acc_stderr": 0.01364760294240639
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.1940928270042194
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4989046006771559,
26
- "acc_stderr": 0.004989769436956927,
27
- "acc_norm": 0.6554471220872337,
28
- "acc_norm_stderr": 0.004742510354777903
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366426
33
- },
34
- "winogrande": {
35
- "acc": 0.5895816890292028,
36
- "acc_stderr": 0.013825107120035865
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7349011223944415,
40
- "acc_stderr": 0.01020698782076139
41
- },
42
- "boolq": {
43
- "acc": 0.5923547400611621,
44
- "acc_stderr": 0.008594580270731613
45
- },
46
- "arc_easy": {
47
- "acc": 0.6031144781144782,
48
- "acc_stderr": 0.010039236800583209,
49
- "acc_norm": 0.5332491582491582,
50
- "acc_norm_stderr": 0.010237073872130745
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28498293515358364,
54
- "acc_stderr": 0.013191348179838793,
55
- "acc_norm": 0.3037542662116041,
56
- "acc_norm_stderr": 0.013438909184778757
57
- },
58
- "sciq": {
59
- "acc": 0.862,
60
- "acc_stderr": 0.010912152632504411,
61
- "acc_norm": 0.782,
62
- "acc_norm_stderr": 0.013063179040595296
63
- },
64
- "piqa": {
65
- "acc": 0.7616974972796517,
66
- "acc_stderr": 0.009940334245876207,
67
- "acc_norm": 0.7693144722524483,
68
- "acc_norm_stderr": 0.009828959550983103
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity/evaluation/rankeval/perplexity_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.334,0.014922019523732958,0
3
+ anli_r2,acc,0.322,0.014782913600996673,0
4
+ anli_r3,acc,0.3425,0.013704669762934727,0
5
+ arc_challenge,acc,0.3037542662116041,0.013438909184778755,0
6
+ arc_challenge,acc_norm,0.32849829351535836,0.013724978465537363,0
7
+ arc_easy,acc,0.6266835016835017,0.009925009142802914,0
8
+ arc_easy,acc_norm,0.5782828282828283,0.010133255284012316,0
9
+ boolq,acc,0.6085626911314985,0.008536430524403954,1
10
+ cb,acc,0.4642857142857143,0.06724777654937658,1
11
+ cb,f1,0.34145702306079667,,1
12
+ copa,acc,0.81,0.03942772444036623,0
13
+ hellaswag,acc,0.49591714797849035,0.004989615052547472,0
14
+ hellaswag,acc_norm,0.6609241187014538,0.004724281487819379,0
15
+ piqa,acc,0.7562568008705114,0.010017199471500617,0
16
+ piqa,acc_norm,0.7627856365614799,0.009924694933586371,0
17
+ rte,acc,0.5595667870036101,0.029882123363118716,0
18
+ sciq,acc,0.906,0.009233052000787733,0
19
+ sciq,acc_norm,0.887,0.010016552866696851,0
20
+ storycloze_2016,acc,0.7279529663281668,0.010290888060871242,0
21
+ winogrande,acc,0.5935280189423836,0.013804448697753375,0
perplexity/evaluation/rankeval/perplexity_1_lm-eval_global_step80108_2023-05-13-09-53-07_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.334,
5
- "acc_stderr": 0.014922019523732958
6
- },
7
- "anli_r2": {
8
- "acc": 0.322,
9
- "acc_stderr": 0.014782913600996673
10
- },
11
- "anli_r3": {
12
- "acc": 0.3425,
13
- "acc_stderr": 0.013704669762934727
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.34145702306079667
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.49591714797849035,
26
- "acc_stderr": 0.004989615052547472,
27
- "acc_norm": 0.6609241187014538,
28
- "acc_norm_stderr": 0.004724281487819379
29
- },
30
- "rte": {
31
- "acc": 0.5595667870036101,
32
- "acc_stderr": 0.029882123363118716
33
- },
34
- "winogrande": {
35
- "acc": 0.5935280189423836,
36
- "acc_stderr": 0.013804448697753375
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7279529663281668,
40
- "acc_stderr": 0.010290888060871242
41
- },
42
- "boolq": {
43
- "acc": 0.6085626911314985,
44
- "acc_stderr": 0.008536430524403954
45
- },
46
- "arc_easy": {
47
- "acc": 0.6266835016835017,
48
- "acc_stderr": 0.009925009142802914,
49
- "acc_norm": 0.5782828282828283,
50
- "acc_norm_stderr": 0.010133255284012316
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3037542662116041,
54
- "acc_stderr": 0.013438909184778755,
55
- "acc_norm": 0.32849829351535836,
56
- "acc_norm_stderr": 0.013724978465537363
57
- },
58
- "sciq": {
59
- "acc": 0.906,
60
- "acc_stderr": 0.009233052000787733,
61
- "acc_norm": 0.887,
62
- "acc_norm_stderr": 0.010016552866696851
63
- },
64
- "piqa": {
65
- "acc": 0.7562568008705114,
66
- "acc_stderr": 0.010017199471500617,
67
- "acc_norm": 0.7627856365614799,
68
- "acc_norm_stderr": 0.009924694933586371
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity/evaluation/rankeval/perplexity_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.33,0.014876872027456732,0
3
+ anli_r2,acc,0.341,0.014998131348402702,0
4
+ anli_r3,acc,0.32166666666666666,0.013490095282989526,0
5
+ arc_challenge,acc,0.30887372013651876,0.013501770929344003,0
6
+ arc_challenge,acc_norm,0.3302047781569966,0.013743085603760427,0
7
+ arc_easy,acc,0.6296296296296297,0.009908978578665757,0
8
+ arc_easy,acc_norm,0.6123737373737373,0.00999730791444761,0
9
+ boolq,acc,0.6244648318042814,0.008469774334938068,1
10
+ cb,acc,0.3392857142857143,0.06384226561930825,1
11
+ cb,f1,0.2736908716975162,,1
12
+ copa,acc,0.78,0.04163331998932262,0
13
+ hellaswag,acc,0.4962158932483569,0.004989638507409918,0
14
+ hellaswag,acc_norm,0.6642103166699861,0.004713006072807722,0
15
+ piqa,acc,0.7573449401523396,0.010002002569708698,0
16
+ piqa,acc_norm,0.7714907508161044,0.009796313511829512,0
17
+ rte,acc,0.5415162454873647,0.029992535385373314,0
18
+ sciq,acc,0.916,0.008776162089491122,0
19
+ sciq,acc_norm,0.898,0.009575368801653902,0
20
+ storycloze_2016,acc,0.7338321753073223,0.010220104800551206,0
21
+ winogrande,acc,0.5935280189423836,0.013804448697753375,0
perplexity/evaluation/rankeval/perplexity_2_lm-eval_global_step80108_2023-05-13-09-53-07_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.33,
5
- "acc_stderr": 0.014876872027456732
6
- },
7
- "anli_r2": {
8
- "acc": 0.341,
9
- "acc_stderr": 0.014998131348402702
10
- },
11
- "anli_r3": {
12
- "acc": 0.32166666666666666,
13
- "acc_stderr": 0.013490095282989526
14
- },
15
- "cb": {
16
- "acc": 0.3392857142857143,
17
- "acc_stderr": 0.06384226561930825,
18
- "f1": 0.2736908716975162
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932262
23
- },
24
- "hellaswag": {
25
- "acc": 0.4962158932483569,
26
- "acc_stderr": 0.004989638507409918,
27
- "acc_norm": 0.6642103166699861,
28
- "acc_norm_stderr": 0.004713006072807722
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5935280189423836,
36
- "acc_stderr": 0.013804448697753375
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7338321753073223,
40
- "acc_stderr": 0.010220104800551206
41
- },
42
- "boolq": {
43
- "acc": 0.6244648318042814,
44
- "acc_stderr": 0.008469774334938068
45
- },
46
- "arc_easy": {
47
- "acc": 0.6296296296296297,
48
- "acc_stderr": 0.009908978578665757,
49
- "acc_norm": 0.6123737373737373,
50
- "acc_norm_stderr": 0.00999730791444761
51
- },
52
- "arc_challenge": {
53
- "acc": 0.30887372013651876,
54
- "acc_stderr": 0.013501770929344003,
55
- "acc_norm": 0.3302047781569966,
56
- "acc_norm_stderr": 0.013743085603760427
57
- },
58
- "sciq": {
59
- "acc": 0.916,
60
- "acc_stderr": 0.008776162089491122,
61
- "acc_norm": 0.898,
62
- "acc_norm_stderr": 0.009575368801653902
63
- },
64
- "piqa": {
65
- "acc": 0.7573449401523396,
66
- "acc_stderr": 0.010002002569708698,
67
- "acc_norm": 0.7714907508161044,
68
- "acc_norm_stderr": 0.009796313511829512
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity/evaluation/rankeval/perplexity_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.313,0.014671272822977888,0
3
+ anli_r2,acc,0.356,0.015149042659306628,0
4
+ anli_r3,acc,0.335,0.013630871843821472,0
5
+ arc_challenge,acc,0.31569965870307165,0.013582571095815291,0
6
+ arc_challenge,acc_norm,0.3387372013651877,0.013830568927974332,0
7
+ arc_easy,acc,0.6384680134680135,0.00985850654316206,0
8
+ arc_easy,acc_norm,0.6195286195286195,0.00996230599205857,0
9
+ boolq,acc,0.6311926605504588,0.00843865607975907,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.4014346362172449,,1
12
+ copa,acc,0.85,0.03588702812826371,0
13
+ hellaswag,acc,0.4993029277036447,0.00498977656227611,0
14
+ hellaswag,acc_norm,0.6642103166699861,0.0047130060728077195,0
15
+ piqa,acc,0.7616974972796517,0.009940334245876207,0
16
+ piqa,acc_norm,0.7720348204570185,0.009788093832324908,0
17
+ rte,acc,0.5523465703971119,0.02993107036293953,0
18
+ sciq,acc,0.923,0.008434580140240662,0
19
+ sciq,acc_norm,0.913,0.00891686663074589,0
20
+ storycloze_2016,acc,0.738107963655799,0.010167174759499612,0
21
+ winogrande,acc,0.6006314127861089,0.013764933546717612,0
perplexity/evaluation/rankeval/perplexity_3_lm-eval_global_step80108_2023-05-13-09-53-07_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.313,
5
- "acc_stderr": 0.014671272822977888
6
- },
7
- "anli_r2": {
8
- "acc": 0.356,
9
- "acc_stderr": 0.015149042659306628
10
- },
11
- "anli_r3": {
12
- "acc": 0.335,
13
- "acc_stderr": 0.013630871843821472
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.4014346362172449
19
- },
20
- "copa": {
21
- "acc": 0.85,
22
- "acc_stderr": 0.03588702812826371
23
- },
24
- "hellaswag": {
25
- "acc": 0.4993029277036447,
26
- "acc_stderr": 0.00498977656227611,
27
- "acc_norm": 0.6642103166699861,
28
- "acc_norm_stderr": 0.0047130060728077195
29
- },
30
- "rte": {
31
- "acc": 0.5523465703971119,
32
- "acc_stderr": 0.02993107036293953
33
- },
34
- "winogrande": {
35
- "acc": 0.6006314127861089,
36
- "acc_stderr": 0.013764933546717612
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.738107963655799,
40
- "acc_stderr": 0.010167174759499612
41
- },
42
- "boolq": {
43
- "acc": 0.6311926605504588,
44
- "acc_stderr": 0.00843865607975907
45
- },
46
- "arc_easy": {
47
- "acc": 0.6384680134680135,
48
- "acc_stderr": 0.00985850654316206,
49
- "acc_norm": 0.6195286195286195,
50
- "acc_norm_stderr": 0.00996230599205857
51
- },
52
- "arc_challenge": {
53
- "acc": 0.31569965870307165,
54
- "acc_stderr": 0.013582571095815291,
55
- "acc_norm": 0.3387372013651877,
56
- "acc_norm_stderr": 0.013830568927974332
57
- },
58
- "sciq": {
59
- "acc": 0.923,
60
- "acc_stderr": 0.008434580140240662,
61
- "acc_norm": 0.913,
62
- "acc_norm_stderr": 0.00891686663074589
63
- },
64
- "piqa": {
65
- "acc": 0.7616974972796517,
66
- "acc_stderr": 0.009940334245876207,
67
- "acc_norm": 0.7720348204570185,
68
- "acc_norm_stderr": 0.009788093832324908
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity/evaluation/rankeval/perplexity_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.341,0.014998131348402709,0
3
+ anli_r2,acc,0.358,0.01516792886540756,0
4
+ anli_r3,acc,0.3516666666666667,0.013789711695404794,0
5
+ arc_challenge,acc,0.3165529010238908,0.01359243151906808,0
6
+ arc_challenge,acc_norm,0.3378839590443686,0.013822047922283509,0
7
+ arc_easy,acc,0.6506734006734006,0.009782853449399284,0
8
+ arc_easy,acc_norm,0.6300505050505051,0.009906656266021148,0
9
+ boolq,acc,0.6425076452599389,0.008382336069484898,1
10
+ cb,acc,0.4642857142857143,0.06724777654937658,1
11
+ cb,f1,0.41697135221649567,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4998008364867556,0.0049897810155954715,0
14
+ hellaswag,acc_norm,0.6693885680143398,0.004694718918225764,0
15
+ piqa,acc,0.7622415669205659,0.009932525779525489,0
16
+ piqa,acc_norm,0.779651795429815,0.009670535456853148,0
17
+ rte,acc,0.5523465703971119,0.02993107036293953,0
18
+ sciq,acc,0.935,0.007799733061832011,0
19
+ sciq,acc_norm,0.925,0.008333333333333364,0
20
+ storycloze_2016,acc,0.7471940138963121,0.01005054390987858,0
21
+ winogrande,acc,0.5887924230465666,0.013829128358676862,0
perplexity/evaluation/rankeval/perplexity_4_lm-eval_global_step80108_2023-05-13-09-53-07_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.341,
5
- "acc_stderr": 0.014998131348402709
6
- },
7
- "anli_r2": {
8
- "acc": 0.358,
9
- "acc_stderr": 0.01516792886540756
10
- },
11
- "anli_r3": {
12
- "acc": 0.3516666666666667,
13
- "acc_stderr": 0.013789711695404794
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.41697135221649567
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4998008364867556,
26
- "acc_stderr": 0.0049897810155954715,
27
- "acc_norm": 0.6693885680143398,
28
- "acc_norm_stderr": 0.004694718918225764
29
- },
30
- "rte": {
31
- "acc": 0.5523465703971119,
32
- "acc_stderr": 0.02993107036293953
33
- },
34
- "winogrande": {
35
- "acc": 0.5887924230465666,
36
- "acc_stderr": 0.013829128358676862
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7471940138963121,
40
- "acc_stderr": 0.01005054390987858
41
- },
42
- "boolq": {
43
- "acc": 0.6425076452599389,
44
- "acc_stderr": 0.008382336069484898
45
- },
46
- "arc_easy": {
47
- "acc": 0.6506734006734006,
48
- "acc_stderr": 0.009782853449399284,
49
- "acc_norm": 0.6300505050505051,
50
- "acc_norm_stderr": 0.009906656266021148
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3165529010238908,
54
- "acc_stderr": 0.01359243151906808,
55
- "acc_norm": 0.3378839590443686,
56
- "acc_norm_stderr": 0.013822047922283509
57
- },
58
- "sciq": {
59
- "acc": 0.935,
60
- "acc_stderr": 0.007799733061832011,
61
- "acc_norm": 0.925,
62
- "acc_norm_stderr": 0.008333333333333364
63
- },
64
- "piqa": {
65
- "acc": 0.7622415669205659,
66
- "acc_stderr": 0.009932525779525489,
67
- "acc_norm": 0.779651795429815,
68
- "acc_norm_stderr": 0.009670535456853148
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity/evaluation/rankeval/perplexity_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.315,0.014696631960792487,0
3
+ anli_r2,acc,0.333,0.014910846164229863,0
4
+ anli_r3,acc,0.32666666666666666,0.013544340907003667,0
5
+ arc_challenge,acc,0.3250853242320819,0.013688147309729119,0
6
+ arc_challenge,acc_norm,0.3515358361774744,0.013952413699600938,0
7
+ arc_easy,acc,0.6447811447811448,0.009820245899287117,0
8
+ arc_easy,acc_norm,0.627104377104377,0.009922743197129238,0
9
+ boolq,acc,0.634862385321101,0.008420941009417812,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.3398268398268398,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4982075283808006,0.004989749347461089,0
14
+ hellaswag,acc_norm,0.6702848038239394,0.004691488813032134,0
15
+ piqa,acc,0.763873775843308,0.009908965890558213,0
16
+ piqa,acc_norm,0.7834602829162133,0.009609984714384593,0
17
+ rte,acc,0.5703971119133574,0.02979666882912467,0
18
+ sciq,acc,0.935,0.007799733061832013,0
19
+ sciq,acc_norm,0.933,0.007910345983177549,0
20
+ storycloze_2016,acc,0.7477284874398717,0.010043504206387307,0
21
+ winogrande,acc,0.5935280189423836,0.013804448697753375,0
perplexity/evaluation/rankeval/perplexity_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.315,
5
- "acc_stderr": 0.014696631960792487
6
- },
7
- "anli_r2": {
8
- "acc": 0.333,
9
- "acc_stderr": 0.014910846164229863
10
- },
11
- "anli_r3": {
12
- "acc": 0.32666666666666666,
13
- "acc_stderr": 0.013544340907003667
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.3398268398268398
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4982075283808006,
26
- "acc_stderr": 0.004989749347461089,
27
- "acc_norm": 0.6702848038239394,
28
- "acc_norm_stderr": 0.004691488813032134
29
- },
30
- "rte": {
31
- "acc": 0.5703971119133574,
32
- "acc_stderr": 0.02979666882912467
33
- },
34
- "winogrande": {
35
- "acc": 0.5935280189423836,
36
- "acc_stderr": 0.013804448697753375
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7477284874398717,
40
- "acc_stderr": 0.010043504206387307
41
- },
42
- "boolq": {
43
- "acc": 0.634862385321101,
44
- "acc_stderr": 0.008420941009417812
45
- },
46
- "arc_easy": {
47
- "acc": 0.6447811447811448,
48
- "acc_stderr": 0.009820245899287117,
49
- "acc_norm": 0.627104377104377,
50
- "acc_norm_stderr": 0.009922743197129238
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3250853242320819,
54
- "acc_stderr": 0.013688147309729119,
55
- "acc_norm": 0.3515358361774744,
56
- "acc_norm_stderr": 0.013952413699600938
57
- },
58
- "sciq": {
59
- "acc": 0.935,
60
- "acc_stderr": 0.007799733061832013,
61
- "acc_norm": 0.933,
62
- "acc_norm_stderr": 0.007910345983177549
63
- },
64
- "piqa": {
65
- "acc": 0.763873775843308,
66
- "acc_stderr": 0.009908965890558213,
67
- "acc_norm": 0.7834602829162133,
68
- "acc_norm_stderr": 0.009609984714384593
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.09956430250623664
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.09956430250623664
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.17546734923272342
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.17546734923272342
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.21686040386653943
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.21686040386653943
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2107633646828048
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2107633646828048
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2120679639683205
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2120679639683205
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2132756026241378
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2132756026241378
14
+ e2e_nlg_cleaned,5,average,multiple,0.18799983114679378
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.056907827559785344
16
+ gem_xsum,0,median,rouge2_fmeasure,0.056907827559785344
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.052109640269165904
18
+ gem_xsum,1,median,rouge2_fmeasure,0.052109640269165904
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.0552385842587779
20
+ gem_xsum,2,median,rouge2_fmeasure,0.0552385842587779
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05215230425231278
22
+ gem_xsum,3,median,rouge2_fmeasure,0.05215230425231278
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.014711826789814367
24
+ gem_xsum,4,median,rouge2_fmeasure,0.014711826789814367
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0006713467768799879
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0006713467768799879
27
+ gem_xsum,5,average,multiple,0.03863192165112271
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04860923158958796
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.04860923158958796
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05451596763365407
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.05451596763365407
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05478096245624173
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.05478096245624173
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05609871570970779
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05609871570970779
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05640172523608126
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.05640172523608126
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05650487878947583
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.05650487878947583
40
+ web_nlg_en,5,average,multiple,0.05448524690245811
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.031517777959615835
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.031517777959615835
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05327544506657774
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05327544506657774
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.057864776743445145
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.057864776743445145
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04723677657570569
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04723677657570569
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014808001674755853
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.014808001674755853
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0021982092172172316
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0021982092172172316
53
+ wiki_lingua_en,5,average,multiple,0.03448349787288625
perplexity25/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3056985698060885, "bleu_stderr": 0.02037727591015296, "rouge1_fmeasure": 0.1041010135673265, "rouge1_fmeasure_stderr": 0.0020426766002203623, "rouge1_precision": 0.07003252145378697, "rouge1_precision_stderr": 0.0017782398203109468, "rouge1_recall": 0.29203668815673745, "rouge1_recall_stderr": 0.00463136764747632, "rouge2_fmeasure": 0.04860923158958796, "rouge2_fmeasure_stderr": 0.0012225487600062404, "rouge2_precision": 0.0332378018219376, "rouge2_precision_stderr": 0.0011349058010884613, "rouge2_recall": 0.13749957036557914, "rouge2_recall_stderr": 0.003196748550458682, "rougeL_fmeasure": 0.10041814595908975, "rougeL_fmeasure_stderr": 0.0019026376308031996, "rougeL_precision": 0.06718224746092341, "rougeL_precision_stderr": 0.0016048036895014346, "rougeL_recall": 0.284110989145015, "rougeL_recall_stderr": 0.004532716214582295, "rougeLsum_fmeasure": 0.0995435574009657, "rougeLsum_fmeasure_stderr": 0.0019249335289248614, "rougeLsum_precision": 0.06682529423751103, "rougeLsum_precision_stderr": 0.0016323321821141092, "rougeLsum_recall": 0.279078746288203, "rougeLsum_recall_stderr": 0.004459389412998906}}, "1": {"PALM_prompt": {"bleu": 0.47118961796111253, "bleu_stderr": 0.03698103082931082, "rouge1_fmeasure": 0.1177743328825882, "rouge1_fmeasure_stderr": 0.001968188676645897, "rouge1_precision": 0.0771933656618608, "rouge1_precision_stderr": 0.0015982731846139909, "rouge1_recall": 0.3718112290369406, "rouge1_recall_stderr": 0.005070882434958457, "rouge2_fmeasure": 0.05451596763365407, "rouge2_fmeasure_stderr": 0.001220440063099909, "rouge2_precision": 0.036081371178737776, "rouge2_precision_stderr": 0.001015398673414324, "rouge2_recall": 0.1783833017712426, "rouge2_recall_stderr": 0.003533942569843169, "rougeL_fmeasure": 0.11130332023816736, "rougeL_fmeasure_stderr": 0.0018115727050860737, "rougeL_precision": 0.07295630439059424, "rougeL_precision_stderr": 0.0014912307741854292, "rougeL_recall": 0.34998602256792577, "rougeL_recall_stderr": 0.004635684392541653, "rougeLsum_fmeasure": 0.1123812114309373, "rougeLsum_fmeasure_stderr": 0.001868171085940279, "rougeLsum_precision": 0.07379279366484903, "rougeLsum_precision_stderr": 0.0015372531917501194, "rougeLsum_recall": 0.352791769527236, "rougeLsum_recall_stderr": 0.004687142295790851}}, "2": {"PALM_prompt": {"bleu": 0.533916369829073, "bleu_stderr": 0.03236121557558326, "rouge1_fmeasure": 0.11840000027377959, "rouge1_fmeasure_stderr": 0.0018926983365336872, "rouge1_precision": 0.07681910931236588, "rouge1_precision_stderr": 0.0015377577308842757, "rouge1_recall": 0.38510316670877215, "rouge1_recall_stderr": 0.00510951140732995, "rouge2_fmeasure": 0.05478096245624173, "rouge2_fmeasure_stderr": 0.0011716039831968148, "rouge2_precision": 0.035783217444623636, "rouge2_precision_stderr": 0.000976654693795426, "rouge2_recall": 0.18984984078562042, "rouge2_recall_stderr": 0.0038247509890711063, "rougeL_fmeasure": 0.1109268884100491, "rougeL_fmeasure_stderr": 0.0017396910638209685, "rougeL_precision": 0.0720673505406942, "rougeL_precision_stderr": 0.0014392854438264812, "rougeL_recall": 0.3590172306490562, "rougeL_recall_stderr": 0.004627049792255802, "rougeLsum_fmeasure": 0.11265574136485931, "rougeLsum_fmeasure_stderr": 0.0018002243925544347, "rougeLsum_precision": 0.07321894524691384, "rougeLsum_precision_stderr": 0.0014825495999582067, "rougeLsum_recall": 0.36495395152990295, "rougeLsum_recall_stderr": 0.0047347356869845165}}, "3": {"PALM_prompt": {"bleu": 0.5595499974128665, "bleu_stderr": 0.037451927861185196, "rouge1_fmeasure": 0.1195630864031263, "rouge1_fmeasure_stderr": 0.00192698764288485, "rouge1_precision": 0.0779317898938085, "rouge1_precision_stderr": 0.0016112650938220434, "rouge1_recall": 0.38762436841783615, "rouge1_recall_stderr": 0.0050758147406836035, "rouge2_fmeasure": 0.05609871570970779, "rouge2_fmeasure_stderr": 0.0012335930575023653, "rouge2_precision": 0.03687332306481112, "rouge2_precision_stderr": 0.0010617608016665735, "rouge2_recall": 0.19234698280186005, "rouge2_recall_stderr": 0.003750582513370695, "rougeL_fmeasure": 0.1120155525419754, "rougeL_fmeasure_stderr": 0.001772429101944478, "rougeL_precision": 0.073041109974632, "rougeL_precision_stderr": 0.0014857335049320345, "rougeL_recall": 0.36061760063023685, "rougeL_recall_stderr": 0.004568863945261958, "rougeLsum_fmeasure": 0.11375064173975076, "rougeLsum_fmeasure_stderr": 0.0018326619772365102, "rougeLsum_precision": 0.07415983300145405, "rougeLsum_precision_stderr": 0.001517950201385796, "rougeLsum_recall": 0.3676109059529136, "rougeLsum_recall_stderr": 0.004730150437103061}}, "4": {"PALM_prompt": {"bleu": 0.600990283111048, "bleu_stderr": 0.034277269466224586, "rouge1_fmeasure": 0.12037133162822398, "rouge1_fmeasure_stderr": 0.00182661073083092, "rouge1_precision": 0.07788179321838232, "rouge1_precision_stderr": 0.0015013142830281594, "rouge1_recall": 0.39994843389146706, "rouge1_recall_stderr": 0.00499808834668635, "rouge2_fmeasure": 0.05640172523608126, "rouge2_fmeasure_stderr": 0.001156416498416956, "rouge2_precision": 0.036729400826605405, "rouge2_precision_stderr": 0.0009707795653244158, "rouge2_recall": 0.20068990178179008, "rouge2_recall_stderr": 0.0038369503757506584, "rougeL_fmeasure": 0.11217146023041902, "rougeL_fmeasure_stderr": 0.001709041887848104, "rougeL_precision": 0.0727796364763943, "rougeL_precision_stderr": 0.0014273675233445258, "rougeL_recall": 0.3686191869505379, "rougeL_recall_stderr": 0.004446290243901058, "rougeLsum_fmeasure": 0.11462098219560242, "rougeLsum_fmeasure_stderr": 0.0017512279921868023, "rougeLsum_precision": 0.07432987376050701, "rougeLsum_precision_stderr": 0.0014578128593730135, "rougeLsum_recall": 0.3785969497473814, "rougeLsum_recall_stderr": 0.004640411180240199}}, "5": {"PALM_prompt": {"bleu": 0.6493610426477754, "bleu_stderr": 0.04632676737275454, "rouge1_fmeasure": 0.12068693871021649, "rouge1_fmeasure_stderr": 0.0017912623734371875, "rouge1_precision": 0.07748205634617567, "rouge1_precision_stderr": 0.001453985027637336, "rouge1_recall": 0.41054791184729794, "rouge1_recall_stderr": 0.004995597119369392, "rouge2_fmeasure": 0.05650487878947583, "rouge2_fmeasure_stderr": 0.0011343334162232442, "rouge2_precision": 0.03639203356358758, "rouge2_precision_stderr": 0.0009275341362302266, "rouge2_recall": 0.20615921276358995, "rouge2_recall_stderr": 0.0037483811580489817, "rougeL_fmeasure": 0.11124019861288943, "rougeL_fmeasure_stderr": 0.0016486705656890794, "rougeL_precision": 0.0716220983354236, "rougeL_precision_stderr": 0.0013593103462355548, "rougeL_recall": 0.3738691644956053, "rougeL_recall_stderr": 0.004316527656281062, "rougeLsum_fmeasure": 0.11456244519233316, "rougeLsum_fmeasure_stderr": 0.0017091885356239022, "rougeLsum_precision": 0.07368867272301145, "rougeLsum_precision_stderr": 0.0014007140781525382, "rougeLsum_recall": 0.38732626297948775, "rougeLsum_recall_stderr": 0.004596911969728994}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.3330159057137432, "bleu_stderr": 0.0488742999692044, "rouge1_fmeasure": 0.165509440090111, "rouge1_fmeasure_stderr": 0.001885757880608957, "rouge1_precision": 0.141331703238122, "rouge1_precision_stderr": 0.001900860421203431, "rouge1_recall": 0.24159721817911073, "rouge1_recall_stderr": 0.002719597450403811, "rouge2_fmeasure": 0.031517777959615835, "rouge2_fmeasure_stderr": 0.0008078153472147894, "rouge2_precision": 0.02669836310001385, "rouge2_precision_stderr": 0.0007156929489602658, "rouge2_recall": 0.04779652460533507, "rouge2_recall_stderr": 0.0013664090841134524, "rougeL_fmeasure": 0.1304904422471148, "rougeL_fmeasure_stderr": 0.0013630786843314023, "rougeL_precision": 0.10997003023895942, "rougeL_precision_stderr": 0.0013406611259386123, "rougeL_recall": 0.19583040922151732, "rougeL_recall_stderr": 0.002241329057684982, "rougeLsum_fmeasure": 0.1520296742001466, "rougeLsum_fmeasure_stderr": 0.0017239456047265416, "rougeLsum_precision": 0.1296243531663802, "rougeLsum_precision_stderr": 0.0017327493840537764, "rougeLsum_recall": 0.2226802105122421, "rougeLsum_recall_stderr": 0.002519165452551483}}, "1": {"tldr_en": {"bleu": 2.6461954540180517, "bleu_stderr": 0.0686607101579094, "rouge1_fmeasure": 0.21776898864919678, "rouge1_fmeasure_stderr": 0.0019444140343936993, "rouge1_precision": 0.19155876571103367, "rouge1_precision_stderr": 0.002235574181803164, "rouge1_recall": 0.31538809129393497, "rouge1_recall_stderr": 0.0027758196777476055, "rouge2_fmeasure": 0.05327544506657774, "rouge2_fmeasure_stderr": 0.0009940356897765103, "rouge2_precision": 0.04732904074441193, "rouge2_precision_stderr": 0.0010360766648087926, "rouge2_recall": 0.07972267565974463, "rouge2_recall_stderr": 0.0016475159287438514, "rougeL_fmeasure": 0.1540568103772324, "rougeL_fmeasure_stderr": 0.0012940294587542422, "rougeL_precision": 0.13475235285926426, "rougeL_precision_stderr": 0.0015468525973368185, "rougeL_recall": 0.22894884785015995, "rougeL_recall_stderr": 0.002173977332942091, "rougeLsum_fmeasure": 0.20425015913438996, "rougeLsum_fmeasure_stderr": 0.0018113854464497062, "rougeLsum_precision": 0.17953406416101728, "rougeLsum_precision_stderr": 0.0020944383459417203, "rougeLsum_recall": 0.29675466574734266, "rougeLsum_recall_stderr": 0.002631907411205019}}, "2": {"tldr_en": {"bleu": 2.901173035530729, "bleu_stderr": 0.05342955865413849, "rouge1_fmeasure": 0.22064842886466463, "rouge1_fmeasure_stderr": 0.001912119893056796, "rouge1_precision": 0.2159533628570682, "rouge1_precision_stderr": 0.002791865511024776, "rouge1_recall": 0.30331115752873467, "rouge1_recall_stderr": 0.002766040617891812, "rouge2_fmeasure": 0.057864776743445145, "rouge2_fmeasure_stderr": 0.0010769988684922042, "rouge2_precision": 0.05993156312422532, "rouge2_precision_stderr": 0.0016401679053386375, "rouge2_recall": 0.08140349201064899, "rouge2_recall_stderr": 0.0016894533790392091, "rougeL_fmeasure": 0.16022824117745368, "rougeL_fmeasure_stderr": 0.001353671145413634, "rougeL_precision": 0.1583382277260483, "rougeL_precision_stderr": 0.0022683450533058772, "rougeL_recall": 0.22488747911377632, "rougeL_recall_stderr": 0.002243037827514705, "rougeLsum_fmeasure": 0.20741540180362425, "rougeLsum_fmeasure_stderr": 0.001786464041080481, "rougeLsum_precision": 0.20328152598542265, "rougeLsum_precision_stderr": 0.002668656156379896, "rougeLsum_recall": 0.2859559600281066, "rougeLsum_recall_stderr": 0.002647687376069602}}, "3": {"tldr_en": {"bleu": 2.8536144517225503, "bleu_stderr": 0.11123670367846313, "rouge1_fmeasure": 0.1795577823082448, "rouge1_fmeasure_stderr": 0.0022472758798550777, "rouge1_precision": 0.18489561929921422, "rouge1_precision_stderr": 0.003137550370770306, "rouge1_recall": 0.24573209214855274, "rouge1_recall_stderr": 0.003309411692082243, "rouge2_fmeasure": 0.04723677657570569, "rouge2_fmeasure_stderr": 0.0010843010239386048, "rouge2_precision": 0.05085763209734922, "rouge2_precision_stderr": 0.0016343790426841317, "rouge2_recall": 0.06635653571954948, "rouge2_recall_stderr": 0.0016849742371905798, "rougeL_fmeasure": 0.13192124238531389, "rougeL_fmeasure_stderr": 0.001640698669618877, "rougeL_precision": 0.13823892948584393, "rougeL_precision_stderr": 0.002555329177823708, "rougeL_recall": 0.18353805339680163, "rougeL_recall_stderr": 0.002620675134490683, "rougeLsum_fmeasure": 0.16886339863190594, "rougeLsum_fmeasure_stderr": 0.0021148662932854957, "rougeLsum_precision": 0.17421610987286962, "rougeLsum_precision_stderr": 0.0029928898493940675, "rougeLsum_recall": 0.23156674787506518, "rougeLsum_recall_stderr": 0.003147427389660552}}, "4": {"tldr_en": {"bleu": 0.5390525330469651, "bleu_stderr": 0.0408109714920231, "rouge1_fmeasure": 0.05703592133005884, "rouge1_fmeasure_stderr": 0.001942831946209364, "rouge1_precision": 0.05870358383616785, "rouge1_precision_stderr": 0.0023430529684724016, "rouge1_recall": 0.08093434137408964, "rouge1_recall_stderr": 0.0028367817835456407, "rouge2_fmeasure": 0.014808001674755853, "rouge2_fmeasure_stderr": 0.0007238729065264632, "rouge2_precision": 0.016328545160307874, "rouge2_precision_stderr": 0.0011000353784742687, "rouge2_recall": 0.02180350520016562, "rouge2_recall_stderr": 0.0011411604156949377, "rougeL_fmeasure": 0.04260019909764127, "rougeL_fmeasure_stderr": 0.0014424624483373777, "rougeL_precision": 0.044888286166910714, "rougeL_precision_stderr": 0.001896455231106093, "rougeL_recall": 0.061444546156244624, "rougeL_recall_stderr": 0.0022065275178083242, "rougeLsum_fmeasure": 0.05352907711826245, "rougeLsum_fmeasure_stderr": 0.0018257144932056115, "rougeLsum_precision": 0.05510529257082465, "rougeLsum_precision_stderr": 0.0022153343918295677, "rougeLsum_recall": 0.07627563508448673, "rougeLsum_recall_stderr": 0.0026884645736360985}}, "5": {"tldr_en": {"bleu": 1.1864794949178182e-06, "bleu_stderr": 2.2172651577024026e-06, "rouge1_fmeasure": 0.008834057407774154, "rouge1_fmeasure_stderr": 0.0008234294750612254, "rouge1_precision": 0.008584563764438044, "rouge1_precision_stderr": 0.0008753066342352569, "rouge1_recall": 0.01304332354397915, "rouge1_recall_stderr": 0.0012616701774741266, "rouge2_fmeasure": 0.0021982092172172316, "rouge2_fmeasure_stderr": 0.0002702560692849953, "rouge2_precision": 0.0020643566994266315, "rouge2_precision_stderr": 0.0002999865807491949, "rouge2_recall": 0.003514834079841484, "rouge2_recall_stderr": 0.00046085843512874496, "rougeL_fmeasure": 0.006719281513421419, "rougeL_fmeasure_stderr": 0.0006225359173726148, "rougeL_precision": 0.0065299312398118745, "rougeL_precision_stderr": 0.0006600353263555559, "rougeL_recall": 0.010172425010043205, "rougeL_recall_stderr": 0.0010133996136714378, "rougeLsum_fmeasure": 0.008173501201069725, "rougeLsum_fmeasure_stderr": 0.0007588029707814247, "rougeLsum_precision": 0.007865790760235851, "rougeLsum_precision_stderr": 0.0007878955854092591, "rougeLsum_recall": 0.012211564420719438, "rougeLsum_recall_stderr": 0.0011927076918159406}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 4.765682760423997, "bleu_stderr": 0.05422767541309084, "rouge1_fmeasure": 0.24332659914676713, "rouge1_fmeasure_stderr": 0.0019769029717195945, "rouge1_precision": 0.17988772034955686, "rouge1_precision_stderr": 0.0016071110305014198, "rouge1_recall": 0.39581785916900913, "rouge1_recall_stderr": 0.0029310989695743622, "rouge2_fmeasure": 0.09956430250623664, "rouge2_fmeasure_stderr": 0.0012064988114572355, "rouge2_precision": 0.07299811599029202, "rouge2_precision_stderr": 0.0009192181950106493, "rouge2_recall": 0.1655787870611749, "rouge2_recall_stderr": 0.0019941826207487525, "rougeL_fmeasure": 0.2048927917623883, "rougeL_fmeasure_stderr": 0.0015629945995320966, "rougeL_precision": 0.1508460812164152, "rougeL_precision_stderr": 0.0012497621473603125, "rougeL_recall": 0.33638856499484304, "rougeL_recall_stderr": 0.002462317442145038, "rougeLsum_fmeasure": 0.21696200359533935, "rougeLsum_fmeasure_stderr": 0.0018532122825693418, "rougeLsum_precision": 0.16020564267536394, "rougeLsum_precision_stderr": 0.0014848327378428146, "rougeLsum_recall": 0.35379035235820266, "rougeLsum_recall_stderr": 0.0028050151273689558}}, "1": {"generate_text_restaurant": {"bleu": 10.468883179786745, "bleu_stderr": 0.14513499791603968, "rouge1_fmeasure": 0.41142927258080764, "rouge1_fmeasure_stderr": 0.0020372645566246617, "rouge1_precision": 0.4461560307039274, "rouge1_precision_stderr": 0.0027882222137359425, "rouge1_recall": 0.4223727993701862, "rouge1_recall_stderr": 0.002779466413933148, "rouge2_fmeasure": 0.17546734923272342, "rouge2_fmeasure_stderr": 0.0017675801853763418, "rouge2_precision": 0.19138036565463667, "rouge2_precision_stderr": 0.0021373415792973888, "rouge2_recall": 0.18105398962411778, "rouge2_recall_stderr": 0.0020434906060036568, "rougeL_fmeasure": 0.29720682946492366, "rougeL_fmeasure_stderr": 0.0018058222015338043, "rougeL_precision": 0.3234887060013004, "rougeL_precision_stderr": 0.002422772594833981, "rougeL_recall": 0.3050111498702241, "rougeL_recall_stderr": 0.0023058891158011475, "rougeLsum_fmeasure": 0.3408169753521894, "rougeLsum_fmeasure_stderr": 0.0020222041567187483, "rougeLsum_precision": 0.36955424114426116, "rougeLsum_precision_stderr": 0.002627564057426632, "rougeLsum_recall": 0.35002355749667186, "rougeLsum_recall_stderr": 0.0025913666710418912}}, "2": {"generate_text_restaurant": {"bleu": 13.15937757187936, "bleu_stderr": 0.17402618441844517, "rouge1_fmeasure": 0.44974386997659066, "rouge1_fmeasure_stderr": 0.0021587456909989825, "rouge1_precision": 0.5116677117570123, "rouge1_precision_stderr": 0.003328551040183069, "rouge1_recall": 0.4467016147483925, "rouge1_recall_stderr": 0.0028150384160634126, "rouge2_fmeasure": 0.21686040386653943, "rouge2_fmeasure_stderr": 0.0019364096185227812, "rouge2_precision": 0.25116353200124875, "rouge2_precision_stderr": 0.002675343325002614, "rouge2_recall": 0.21531631345677743, "rouge2_recall_stderr": 0.0021418289381007287, "rougeL_fmeasure": 0.3389361888586281, "rougeL_fmeasure_stderr": 0.001994011964068714, "rougeL_precision": 0.3875589106822081, "rougeL_precision_stderr": 0.0029960409984381296, "rougeL_recall": 0.3362068824396353, "rougeL_recall_stderr": 0.0024065124976756567, "rougeLsum_fmeasure": 0.38104681490559605, "rougeLsum_fmeasure_stderr": 0.0021875307111738898, "rougeLsum_precision": 0.43414987277433165, "rougeLsum_precision_stderr": 0.0032063239226803007, "rougeLsum_recall": 0.37817721348532046, "rougeLsum_recall_stderr": 0.0026544745921995615}}, "3": {"generate_text_restaurant": {"bleu": 12.129694716562593, "bleu_stderr": 0.09885740494020588, "rouge1_fmeasure": 0.4414312727012384, "rouge1_fmeasure_stderr": 0.0019519907059499023, "rouge1_precision": 0.4445157890248481, "rouge1_precision_stderr": 0.002595665567685936, "rouge1_recall": 0.47798126521143697, "rouge1_recall_stderr": 0.0027344918481641326, "rouge2_fmeasure": 0.2107633646828048, "rouge2_fmeasure_stderr": 0.0018036725966383619, "rouge2_precision": 0.2128182501314406, "rouge2_precision_stderr": 0.002080514621594728, "rouge2_recall": 0.23033986601501272, "rouge2_recall_stderr": 0.002220133805877886, "rougeL_fmeasure": 0.32903441642513187, "rougeL_fmeasure_stderr": 0.001780398769078088, "rougeL_precision": 0.33200437713670644, "rougeL_precision_stderr": 0.002293852098406482, "rougeL_recall": 0.35658301284722854, "rougeL_recall_stderr": 0.0023733016537168627, "rougeLsum_fmeasure": 0.3753007198038583, "rougeLsum_fmeasure_stderr": 0.0020106738190462125, "rougeLsum_precision": 0.37782185980378685, "rougeLsum_precision_stderr": 0.0025010018735451525, "rougeLsum_recall": 0.40662395947064806, "rougeLsum_recall_stderr": 0.002656543475373511}}, "4": {"generate_text_restaurant": {"bleu": 12.132825433584408, "bleu_stderr": 0.16624627774955542, "rouge1_fmeasure": 0.44537146771645886, "rouge1_fmeasure_stderr": 0.001904249537543325, "rouge1_precision": 0.43551787708171735, "rouge1_precision_stderr": 0.0022984268679554208, "rouge1_recall": 0.488774946659327, "rouge1_recall_stderr": 0.0026403071353862067, "rouge2_fmeasure": 0.2120679639683205, "rouge2_fmeasure_stderr": 0.0017947484063169713, "rouge2_precision": 0.20679648264384795, "rouge2_precision_stderr": 0.0018808563087133726, "rouge2_recall": 0.23542355545577054, "rouge2_recall_stderr": 0.0022374175074990257, "rougeL_fmeasure": 0.33096010620007504, "rougeL_fmeasure_stderr": 0.0017733714402514843, "rougeL_precision": 0.32327382970798735, "rougeL_precision_stderr": 0.001995377079294585, "rougeL_recall": 0.36425119832827263, "rougeL_recall_stderr": 0.0023824504483212824, "rougeLsum_fmeasure": 0.380150071138794, "rougeLsum_fmeasure_stderr": 0.0019990851143011548, "rougeLsum_precision": 0.37161856664153076, "rougeLsum_precision_stderr": 0.0022688211821483505, "rougeLsum_recall": 0.41744652285901757, "rougeLsum_recall_stderr": 0.0026213051295942776}}, "5": {"generate_text_restaurant": {"bleu": 12.150501863631494, "bleu_stderr": 0.11314994370401227, "rouge1_fmeasure": 0.44709276979411555, "rouge1_fmeasure_stderr": 0.001910905514583041, "rouge1_precision": 0.43384350876692923, "rouge1_precision_stderr": 0.0022430721979742672, "rouge1_recall": 0.4930603001034525, "rouge1_recall_stderr": 0.002670829206562973, "rouge2_fmeasure": 0.2132756026241378, "rouge2_fmeasure_stderr": 0.001789068514228881, "rouge2_precision": 0.20616106117256064, "rouge2_precision_stderr": 0.001830057336865641, "rouge2_recall": 0.23792096807821955, "rouge2_recall_stderr": 0.00223350281305114, "rougeL_fmeasure": 0.3325234854628361, "rougeL_fmeasure_stderr": 0.0017707439403097452, "rougeL_precision": 0.3221029481308322, "rougeL_precision_stderr": 0.0019290523867159137, "rougeL_recall": 0.36806957352620984, "rougeL_recall_stderr": 0.0024290518235640574, "rougeLsum_fmeasure": 0.38137613302372186, "rougeLsum_fmeasure_stderr": 0.0020158755209571493, "rougeLsum_precision": 0.36995123008018044, "rougeLsum_precision_stderr": 0.002227439242864291, "rougeLsum_recall": 0.42090674888926755, "rougeLsum_recall_stderr": 0.002665490051406233}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.3861113355807007, "bleu_stderr": 0.11074519195435138, "rouge1_fmeasure": 0.22169655207824404, "rouge1_fmeasure_stderr": 0.002737165165866136, "rouge1_precision": 0.16949773638289, "rouge1_precision_stderr": 0.0025008930161999152, "rouge1_recall": 0.35927434904922223, "rouge1_recall_stderr": 0.004560400461326097, "rouge2_fmeasure": 0.056907827559785344, "rouge2_fmeasure_stderr": 0.0018251232736631654, "rouge2_precision": 0.0431701596864316, "rouge2_precision_stderr": 0.001534312955547038, "rouge2_recall": 0.09474053242204084, "rouge2_recall_stderr": 0.002989222654082048, "rougeL_fmeasure": 0.168504331059963, "rougeL_fmeasure_stderr": 0.0021699742342189397, "rougeL_precision": 0.1290524889805869, "rougeL_precision_stderr": 0.0020543657036148173, "rougeL_recall": 0.27361967658141045, "rougeL_recall_stderr": 0.0035963924946506493, "rougeLsum_fmeasure": 0.1751085620216996, "rougeLsum_fmeasure_stderr": 0.0024047356803931223, "rougeLsum_precision": 0.13370844006346885, "rougeLsum_precision_stderr": 0.0021670339989317035, "rougeLsum_recall": 0.285256434682157, "rougeLsum_recall_stderr": 0.00408399090358691}}, "1": {"article_DOC_summary": {"bleu": 2.1599927233517997, "bleu_stderr": 0.11630676231683935, "rouge1_fmeasure": 0.20562256846157667, "rouge1_fmeasure_stderr": 0.0025917247878476176, "rouge1_precision": 0.1463713881635103, "rouge1_precision_stderr": 0.001933537484388361, "rouge1_recall": 0.35985532344291576, "rouge1_recall_stderr": 0.0044711381442348866, "rouge2_fmeasure": 0.052109640269165904, "rouge2_fmeasure_stderr": 0.0016499123215979108, "rouge2_precision": 0.036703622860321694, "rouge2_precision_stderr": 0.001172772206888684, "rouge2_recall": 0.0939485415027206, "rouge2_recall_stderr": 0.003047610569508944, "rougeL_fmeasure": 0.15866900668787276, "rougeL_fmeasure_stderr": 0.0019458892417091485, "rougeL_precision": 0.1126929429063383, "rougeL_precision_stderr": 0.0014352356833621214, "rougeL_recall": 0.27967026507009923, "rougeL_recall_stderr": 0.0035535952255309837, "rougeLsum_fmeasure": 0.16374652691863595, "rougeLsum_fmeasure_stderr": 0.0021813353055902, "rougeLsum_precision": 0.1163220909216439, "rougeLsum_precision_stderr": 0.001603962253257027, "rougeLsum_recall": 0.2884084692973325, "rougeLsum_recall_stderr": 0.003919667134306644}}, "2": {"article_DOC_summary": {"bleu": 2.258614670093732, "bleu_stderr": 0.08951840120004514, "rouge1_fmeasure": 0.20934999828708714, "rouge1_fmeasure_stderr": 0.0026521179445819754, "rouge1_precision": 0.14912207031164892, "rouge1_precision_stderr": 0.001973097149015836, "rouge1_recall": 0.3654183781592464, "rouge1_recall_stderr": 0.004588246988437046, "rouge2_fmeasure": 0.0552385842587779, "rouge2_fmeasure_stderr": 0.0017598952748467285, "rouge2_precision": 0.038943395782983094, "rouge2_precision_stderr": 0.0012528357050940226, "rouge2_recall": 0.09918868504865098, "rouge2_recall_stderr": 0.003233414024371685, "rougeL_fmeasure": 0.162664466797653, "rougeL_fmeasure_stderr": 0.002068423484247533, "rougeL_precision": 0.1156734461385958, "rougeL_precision_stderr": 0.001526111534509325, "rougeL_recall": 0.28548150411787737, "rougeL_recall_stderr": 0.003702367860147114, "rougeLsum_fmeasure": 0.16589005254411232, "rougeLsum_fmeasure_stderr": 0.0022566042044053773, "rougeLsum_precision": 0.11782158647690419, "rougeLsum_precision_stderr": 0.001648802010987179, "rougeLsum_recall": 0.29187107619025976, "rougeLsum_recall_stderr": 0.004071548917864099}}, "3": {"article_DOC_summary": {"bleu": 2.2258566055479094, "bleu_stderr": 0.13278688909462125, "rouge1_fmeasure": 0.2009780831532814, "rouge1_fmeasure_stderr": 0.002806726765718572, "rouge1_precision": 0.14621341287651227, "rouge1_precision_stderr": 0.0022083149046151504, "rouge1_recall": 0.3455642084346473, "rouge1_recall_stderr": 0.004873293934425135, "rouge2_fmeasure": 0.05215230425231278, "rouge2_fmeasure_stderr": 0.0017120956588686437, "rouge2_precision": 0.037135604961774385, "rouge2_precision_stderr": 0.0012274885414975227, "rouge2_recall": 0.09277441654545833, "rouge2_recall_stderr": 0.0031426558683045865, "rougeL_fmeasure": 0.15645205789193842, "rougeL_fmeasure_stderr": 0.0021779473947538264, "rougeL_precision": 0.11361322830742238, "rougeL_precision_stderr": 0.0016918706245087672, "rougeL_recall": 0.27017363415694445, "rougeL_recall_stderr": 0.003905391020509053, "rougeLsum_fmeasure": 0.16002294003908046, "rougeLsum_fmeasure_stderr": 0.002408602292236863, "rougeLsum_precision": 0.11619780916522449, "rougeLsum_precision_stderr": 0.001872558794638193, "rougeLsum_recall": 0.2770186544920654, "rougeLsum_recall_stderr": 0.004312222731166786}}, "4": {"article_DOC_summary": {"bleu": 1.1689695560959992, "bleu_stderr": 0.14941170439296517, "rouge1_fmeasure": 0.05602682791591825, "rouge1_fmeasure_stderr": 0.0030566032233541435, "rouge1_precision": 0.04540495134964114, "rouge1_precision_stderr": 0.002577404539511274, "rouge1_recall": 0.08944999300372532, "rouge1_recall_stderr": 0.005043063336771766, "rouge2_fmeasure": 0.014711826789814367, "rouge2_fmeasure_stderr": 0.001150153366023636, "rouge2_precision": 0.011337811755364056, "rouge2_precision_stderr": 0.000979022200716812, "rouge2_recall": 0.02507829492852627, "rouge2_recall_stderr": 0.002014742970966847, "rougeL_fmeasure": 0.04340501623577573, "rougeL_fmeasure_stderr": 0.002382694314714795, "rougeL_precision": 0.03552240669275928, "rougeL_precision_stderr": 0.0020775346300726987, "rougeL_recall": 0.0694647286645038, "rougeL_recall_stderr": 0.003953679220478554, "rougeLsum_fmeasure": 0.045785794288580685, "rougeLsum_fmeasure_stderr": 0.002531815611533499, "rougeLsum_precision": 0.037416239535642694, "rougeLsum_precision_stderr": 0.002189640562314706, "rougeLsum_recall": 0.07349320517885673, "rougeLsum_recall_stderr": 0.004238026627054986}}, "5": {"article_DOC_summary": {"bleu": 3.968509960352518e-38, "bleu_stderr": 9.071697715642431e-34, "rouge1_fmeasure": 0.0031078634832113075, "rouge1_fmeasure_stderr": 0.0008672375952429783, "rouge1_precision": 0.0033252126043761066, "rouge1_precision_stderr": 0.0009227377103489034, "rouge1_recall": 0.0030093268798964366, "rouge1_recall_stderr": 0.0008438210949622407, "rouge2_fmeasure": 0.0006713467768799879, "rouge2_fmeasure_stderr": 0.0003135528125707898, "rouge2_precision": 0.0007101797265504257, "rouge2_precision_stderr": 0.0003230925998744655, "rouge2_recall": 0.0006426199586576944, "rouge2_recall_stderr": 0.0003070058829967824, "rougeL_fmeasure": 0.002288473742428132, "rougeL_fmeasure_stderr": 0.0006642689228610262, "rougeL_precision": 0.0024626201177109525, "rougeL_precision_stderr": 0.0007048985126880111, "rougeL_recall": 0.0021932876500475765, "rougeL_recall_stderr": 0.0006418079510766518, "rougeLsum_fmeasure": 0.0024871269676956433, "rougeLsum_fmeasure_stderr": 0.0007048637473097949, "rougeLsum_precision": 0.0026697560249461646, "rougeLsum_precision_stderr": 0.0007461825026075466, "rougeLsum_recall": 0.0023870797070471815, "rougeLsum_recall_stderr": 0.0006822657353438714}}}}
perplexity25/evaluation/rankeval/perplexity25_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.338,0.014965960710224482,0
3
+ anli_r2,acc,0.341,0.014998131348402699,0
4
+ anli_r3,acc,0.3491666666666667,0.013767075395077249,0
5
+ arc_challenge,acc,0.3054607508532423,0.013460080478002501,0
6
+ arc_challenge,acc_norm,0.3267918088737201,0.013706665975587336,0
7
+ arc_easy,acc,0.6430976430976431,0.009830630210347005,0
8
+ arc_easy,acc_norm,0.5686026936026936,0.010162752847747505,0
9
+ boolq,acc,0.5972477064220183,0.008578054401368405,1
10
+ cb,acc,0.5,0.06741998624632421,1
11
+ cb,f1,0.33763440860215055,,1
12
+ copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.4814777932682733,0.004986356526063966,0
14
+ hellaswag,acc_norm,0.6296554471220872,0.004819100456867812,0
15
+ piqa,acc,0.7415669205658324,0.010213971636773319,0
16
+ piqa,acc_norm,0.7421109902067464,0.010206956662056252,0
17
+ rte,acc,0.5884476534296029,0.029621832222417196,0
18
+ sciq,acc,0.869,0.010674874844837952,0
19
+ sciq,acc_norm,0.773,0.013253174964763921,0
20
+ storycloze_2016,acc,0.740780331373597,0.010133463176449564,0
21
+ winogrande,acc,0.611681136543015,0.01369745665845723,0
perplexity25/evaluation/rankeval/perplexity25_0_lm-eval_global_step80108_2023-05-13-09-53-07_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.338,
5
- "acc_stderr": 0.014965960710224482
6
- },
7
- "anli_r2": {
8
- "acc": 0.341,
9
- "acc_stderr": 0.014998131348402699
10
- },
11
- "anli_r3": {
12
- "acc": 0.3491666666666667,
13
- "acc_stderr": 0.013767075395077249
14
- },
15
- "cb": {
16
- "acc": 0.5,
17
- "acc_stderr": 0.06741998624632421,
18
- "f1": 0.33763440860215055
19
- },
20
- "copa": {
21
- "acc": 0.76,
22
- "acc_stderr": 0.04292346959909283
23
- },
24
- "hellaswag": {
25
- "acc": 0.4814777932682733,
26
- "acc_stderr": 0.004986356526063966,
27
- "acc_norm": 0.6296554471220872,
28
- "acc_norm_stderr": 0.004819100456867812
29
- },
30
- "rte": {
31
- "acc": 0.5884476534296029,
32
- "acc_stderr": 0.029621832222417196
33
- },
34
- "winogrande": {
35
- "acc": 0.611681136543015,
36
- "acc_stderr": 0.01369745665845723
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.740780331373597,
40
- "acc_stderr": 0.010133463176449564
41
- },
42
- "boolq": {
43
- "acc": 0.5972477064220183,
44
- "acc_stderr": 0.008578054401368405
45
- },
46
- "arc_easy": {
47
- "acc": 0.6430976430976431,
48
- "acc_stderr": 0.009830630210347005,
49
- "acc_norm": 0.5686026936026936,
50
- "acc_norm_stderr": 0.010162752847747505
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3054607508532423,
54
- "acc_stderr": 0.013460080478002501,
55
- "acc_norm": 0.3267918088737201,
56
- "acc_norm_stderr": 0.013706665975587336
57
- },
58
- "sciq": {
59
- "acc": 0.869,
60
- "acc_stderr": 0.010674874844837952,
61
- "acc_norm": 0.773,
62
- "acc_norm_stderr": 0.013253174964763921
63
- },
64
- "piqa": {
65
- "acc": 0.7415669205658324,
66
- "acc_stderr": 0.010213971636773319,
67
- "acc_norm": 0.7421109902067464,
68
- "acc_norm_stderr": 0.010206956662056252
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.329,0.014865395385928364,0
3
+ anli_r2,acc,0.323,0.01479492784334864,0
4
+ anli_r3,acc,0.3425,0.013704669762934725,0
5
+ arc_challenge,acc,0.3310580204778157,0.01375206241981784,0
6
+ arc_challenge,acc_norm,0.3575085324232082,0.014005494275916573,0
7
+ arc_easy,acc,0.6632996632996633,0.009697166595752472,0
8
+ arc_easy,acc_norm,0.6397306397306397,0.00985100258473238,0
9
+ boolq,acc,0.6412844036697247,0.0083886680340594,1
10
+ cb,acc,0.5178571428571429,0.06737697508644648,1
11
+ cb,f1,0.36209571547917413,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4794861581358295,0.0049855800659464565,0
14
+ hellaswag,acc_norm,0.6293567018522207,0.00481989994534249,0
15
+ piqa,acc,0.7388465723612623,0.010248738649935578,0
16
+ piqa,acc_norm,0.7301414581066377,0.010356595421852195,0
17
+ rte,acc,0.5703971119133574,0.02979666882912467,0
18
+ sciq,acc,0.916,0.008776162089491115,0
19
+ sciq,acc_norm,0.899,0.009533618929340987,0
20
+ storycloze_2016,acc,0.7295563869588455,0.010271810373331027,0
21
+ winogrande,acc,0.6148382004735596,0.013676821287521412,0
perplexity25/evaluation/rankeval/perplexity25_1_lm-eval_global_step80108_2023-05-13-09-53-07_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.329,
5
- "acc_stderr": 0.014865395385928364
6
- },
7
- "anli_r2": {
8
- "acc": 0.323,
9
- "acc_stderr": 0.01479492784334864
10
- },
11
- "anli_r3": {
12
- "acc": 0.3425,
13
- "acc_stderr": 0.013704669762934725
14
- },
15
- "cb": {
16
- "acc": 0.5178571428571429,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.36209571547917413
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4794861581358295,
26
- "acc_stderr": 0.0049855800659464565,
27
- "acc_norm": 0.6293567018522207,
28
- "acc_norm_stderr": 0.00481989994534249
29
- },
30
- "rte": {
31
- "acc": 0.5703971119133574,
32
- "acc_stderr": 0.02979666882912467
33
- },
34
- "winogrande": {
35
- "acc": 0.6148382004735596,
36
- "acc_stderr": 0.013676821287521412
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7295563869588455,
40
- "acc_stderr": 0.010271810373331027
41
- },
42
- "boolq": {
43
- "acc": 0.6412844036697247,
44
- "acc_stderr": 0.0083886680340594
45
- },
46
- "arc_easy": {
47
- "acc": 0.6632996632996633,
48
- "acc_stderr": 0.009697166595752472,
49
- "acc_norm": 0.6397306397306397,
50
- "acc_norm_stderr": 0.00985100258473238
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3310580204778157,
54
- "acc_stderr": 0.01375206241981784,
55
- "acc_norm": 0.3575085324232082,
56
- "acc_norm_stderr": 0.014005494275916573
57
- },
58
- "sciq": {
59
- "acc": 0.916,
60
- "acc_stderr": 0.008776162089491115,
61
- "acc_norm": 0.899,
62
- "acc_norm_stderr": 0.009533618929340987
63
- },
64
- "piqa": {
65
- "acc": 0.7388465723612623,
66
- "acc_stderr": 0.010248738649935578,
67
- "acc_norm": 0.7301414581066377,
68
- "acc_norm_stderr": 0.010356595421852195
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.33,0.014876872027456732,0
3
+ anli_r2,acc,0.33,0.014876872027456736,0
4
+ anli_r3,acc,0.33166666666666667,0.01359683672948516,0
5
+ arc_challenge,acc,0.3378839590443686,0.013822047922283507,0
6
+ arc_challenge,acc_norm,0.3660409556313993,0.014077223108470144,0
7
+ arc_easy,acc,0.6742424242424242,0.009616642976885964,0
8
+ arc_easy,acc_norm,0.6523569023569024,0.009771868846830909,0
9
+ boolq,acc,0.6428134556574924,0.008380743796951404,1
10
+ cb,acc,0.5178571428571429,0.06737697508644648,1
11
+ cb,f1,0.35968427443837275,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4790878311093408,0.004985415250690917,0
14
+ hellaswag,acc_norm,0.6304521011750648,0.004816958817726088,0
15
+ piqa,acc,0.7404787812840044,0.010227939888173918,0
16
+ piqa,acc_norm,0.7388465723612623,0.010248738649935587,0
17
+ rte,acc,0.5703971119133574,0.02979666882912467,0
18
+ sciq,acc,0.923,0.008434580140240644,0
19
+ sciq,acc_norm,0.901,0.009449248027662746,0
20
+ storycloze_2016,acc,0.7365045430251203,0.010187168219156485,0
21
+ winogrande,acc,0.6235201262825573,0.013616931960667187,0
perplexity25/evaluation/rankeval/perplexity25_2_lm-eval_global_step80108_2023-05-13-09-53-07_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.33,
5
- "acc_stderr": 0.014876872027456732
6
- },
7
- "anli_r2": {
8
- "acc": 0.33,
9
- "acc_stderr": 0.014876872027456736
10
- },
11
- "anli_r3": {
12
- "acc": 0.33166666666666667,
13
- "acc_stderr": 0.01359683672948516
14
- },
15
- "cb": {
16
- "acc": 0.5178571428571429,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.35968427443837275
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4790878311093408,
26
- "acc_stderr": 0.004985415250690917,
27
- "acc_norm": 0.6304521011750648,
28
- "acc_norm_stderr": 0.004816958817726088
29
- },
30
- "rte": {
31
- "acc": 0.5703971119133574,
32
- "acc_stderr": 0.02979666882912467
33
- },
34
- "winogrande": {
35
- "acc": 0.6235201262825573,
36
- "acc_stderr": 0.013616931960667187
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7365045430251203,
40
- "acc_stderr": 0.010187168219156485
41
- },
42
- "boolq": {
43
- "acc": 0.6428134556574924,
44
- "acc_stderr": 0.008380743796951404
45
- },
46
- "arc_easy": {
47
- "acc": 0.6742424242424242,
48
- "acc_stderr": 0.009616642976885964,
49
- "acc_norm": 0.6523569023569024,
50
- "acc_norm_stderr": 0.009771868846830909
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3378839590443686,
54
- "acc_stderr": 0.013822047922283507,
55
- "acc_norm": 0.3660409556313993,
56
- "acc_norm_stderr": 0.014077223108470144
57
- },
58
- "sciq": {
59
- "acc": 0.923,
60
- "acc_stderr": 0.008434580140240644,
61
- "acc_norm": 0.901,
62
- "acc_norm_stderr": 0.009449248027662746
63
- },
64
- "piqa": {
65
- "acc": 0.7404787812840044,
66
- "acc_stderr": 0.010227939888173918,
67
- "acc_norm": 0.7388465723612623,
68
- "acc_norm_stderr": 0.010248738649935587
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.324,0.014806864733738859,0
3
+ anli_r2,acc,0.322,0.014782913600996676,0
4
+ anli_r3,acc,0.33166666666666667,0.013596836729485163,0
5
+ arc_challenge,acc,0.318259385665529,0.013611993916971453,0
6
+ arc_challenge,acc_norm,0.35921501706484643,0.01402022415583914,0
7
+ arc_easy,acc,0.6712962962962963,0.009638903167022173,0
8
+ arc_easy,acc_norm,0.6641414141414141,0.009691180932083508,0
9
+ boolq,acc,0.6474006116207951,0.008356412493562119,1
10
+ cb,acc,0.5,0.06741998624632421,1
11
+ cb,f1,0.32868937048503616,,1
12
+ copa,acc,0.81,0.03942772444036623,0
13
+ hellaswag,acc,0.4817765385381398,0.00498646615169878,0
14
+ hellaswag,acc_norm,0.6341366261700856,0.004806870285747301,0
15
+ piqa,acc,0.7431991294885746,0.010192864802278052,0
16
+ piqa,acc_norm,0.7459194776931447,0.010157271999135046,0
17
+ rte,acc,0.5451263537906137,0.029973636495415255,0
18
+ sciq,acc,0.923,0.008434580140240646,0
19
+ sciq,acc_norm,0.917,0.00872852720607479,0
20
+ storycloze_2016,acc,0.7439871726349546,0.010092361160364261,0
21
+ winogrande,acc,0.6290449881610103,0.01357639990223157,0
perplexity25/evaluation/rankeval/perplexity25_3_lm-eval_global_step80108_2023-05-13-09-53-07_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.014806864733738859
6
- },
7
- "anli_r2": {
8
- "acc": 0.322,
9
- "acc_stderr": 0.014782913600996676
10
- },
11
- "anli_r3": {
12
- "acc": 0.33166666666666667,
13
- "acc_stderr": 0.013596836729485163
14
- },
15
- "cb": {
16
- "acc": 0.5,
17
- "acc_stderr": 0.06741998624632421,
18
- "f1": 0.32868937048503616
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.4817765385381398,
26
- "acc_stderr": 0.00498646615169878,
27
- "acc_norm": 0.6341366261700856,
28
- "acc_norm_stderr": 0.004806870285747301
29
- },
30
- "rte": {
31
- "acc": 0.5451263537906137,
32
- "acc_stderr": 0.029973636495415255
33
- },
34
- "winogrande": {
35
- "acc": 0.6290449881610103,
36
- "acc_stderr": 0.01357639990223157
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7439871726349546,
40
- "acc_stderr": 0.010092361160364261
41
- },
42
- "boolq": {
43
- "acc": 0.6474006116207951,
44
- "acc_stderr": 0.008356412493562119
45
- },
46
- "arc_easy": {
47
- "acc": 0.6712962962962963,
48
- "acc_stderr": 0.009638903167022173,
49
- "acc_norm": 0.6641414141414141,
50
- "acc_norm_stderr": 0.009691180932083508
51
- },
52
- "arc_challenge": {
53
- "acc": 0.318259385665529,
54
- "acc_stderr": 0.013611993916971453,
55
- "acc_norm": 0.35921501706484643,
56
- "acc_norm_stderr": 0.01402022415583914
57
- },
58
- "sciq": {
59
- "acc": 0.923,
60
- "acc_stderr": 0.008434580140240646,
61
- "acc_norm": 0.917,
62
- "acc_norm_stderr": 0.00872852720607479
63
- },
64
- "piqa": {
65
- "acc": 0.7431991294885746,
66
- "acc_stderr": 0.010192864802278052,
67
- "acc_norm": 0.7459194776931447,
68
- "acc_norm_stderr": 0.010157271999135046
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.335,0.014933117490932575,0
3
+ anli_r2,acc,0.326,0.014830507204541042,0
4
+ anli_r3,acc,0.33666666666666667,0.013647602942406401,0
5
+ arc_challenge,acc,0.32764505119453924,0.013715847940719346,0
6
+ arc_challenge,acc_norm,0.3728668941979522,0.014131176760131163,0
7
+ arc_easy,acc,0.6788720538720538,0.009580787536986797,0
8
+ arc_easy,acc_norm,0.6590909090909091,0.009726579593424019,0
9
+ boolq,acc,0.652599388379205,0.00832781675259947,1
10
+ cb,acc,0.4642857142857143,0.06724777654937658,1
11
+ cb,f1,0.3026891807379612,,1
12
+ copa,acc,0.83,0.037752516806863715,0
13
+ hellaswag,acc,0.4787890858394742,0.004985289555586538,0
14
+ hellaswag,acc_norm,0.6365265883290181,0.004800164434233263,0
15
+ piqa,acc,0.73449401523395,0.010303308653024427,0
16
+ piqa,acc_norm,0.7383025027203483,0.010255630772708227,0
17
+ rte,acc,0.5342960288808665,0.030025579819366426,0
18
+ sciq,acc,0.927,0.008230354715244054,0
19
+ sciq,acc_norm,0.921,0.008534156773333435,0
20
+ storycloze_2016,acc,0.7471940138963121,0.010050543909878572,0
21
+ winogrande,acc,0.6179952644041041,0.013655578215970418,0
perplexity25/evaluation/rankeval/perplexity25_4_lm-eval_global_step80108_2023-05-13-09-53-07_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.335,
5
- "acc_stderr": 0.014933117490932575
6
- },
7
- "anli_r2": {
8
- "acc": 0.326,
9
- "acc_stderr": 0.014830507204541042
10
- },
11
- "anli_r3": {
12
- "acc": 0.33666666666666667,
13
- "acc_stderr": 0.013647602942406401
14
- },
15
- "cb": {
16
- "acc": 0.4642857142857143,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.3026891807379612
19
- },
20
- "copa": {
21
- "acc": 0.83,
22
- "acc_stderr": 0.037752516806863715
23
- },
24
- "hellaswag": {
25
- "acc": 0.4787890858394742,
26
- "acc_stderr": 0.004985289555586538,
27
- "acc_norm": 0.6365265883290181,
28
- "acc_norm_stderr": 0.004800164434233263
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366426
33
- },
34
- "winogrande": {
35
- "acc": 0.6179952644041041,
36
- "acc_stderr": 0.013655578215970418
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7471940138963121,
40
- "acc_stderr": 0.010050543909878572
41
- },
42
- "boolq": {
43
- "acc": 0.652599388379205,
44
- "acc_stderr": 0.00832781675259947
45
- },
46
- "arc_easy": {
47
- "acc": 0.6788720538720538,
48
- "acc_stderr": 0.009580787536986797,
49
- "acc_norm": 0.6590909090909091,
50
- "acc_norm_stderr": 0.009726579593424019
51
- },
52
- "arc_challenge": {
53
- "acc": 0.32764505119453924,
54
- "acc_stderr": 0.013715847940719346,
55
- "acc_norm": 0.3728668941979522,
56
- "acc_norm_stderr": 0.014131176760131163
57
- },
58
- "sciq": {
59
- "acc": 0.927,
60
- "acc_stderr": 0.008230354715244054,
61
- "acc_norm": 0.921,
62
- "acc_norm_stderr": 0.008534156773333435
63
- },
64
- "piqa": {
65
- "acc": 0.73449401523395,
66
- "acc_stderr": 0.010303308653024427,
67
- "acc_norm": 0.7383025027203483,
68
- "acc_norm_stderr": 0.010255630772708227
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity25/evaluation/rankeval/perplexity25_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.342,0.01500870618212173,0
3
+ anli_r2,acc,0.34,0.014987482264363937,0
4
+ anli_r3,acc,0.335,0.013630871843821479,0
5
+ arc_challenge,acc,0.34044368600682595,0.013847460518892981,0
6
+ arc_challenge,acc_norm,0.36945392491467577,0.0141045783664919,0
7
+ arc_easy,acc,0.680976430976431,0.009564133249441074,0
8
+ arc_easy,acc_norm,0.6658249158249159,0.009679106032919058,0
9
+ boolq,acc,0.6501529051987768,0.008341409251946758,1
10
+ cb,acc,0.48214285714285715,0.06737697508644648,1
11
+ cb,f1,0.31573655103066867,,1
12
+ copa,acc,0.83,0.03775251680686371,0
13
+ hellaswag,acc,0.4805815574586736,0.00498601693867853,0
14
+ hellaswag,acc_norm,0.6378211511651065,0.004796478664403837,0
15
+ piqa,acc,0.7383025027203483,0.010255630772708229,0
16
+ piqa,acc_norm,0.735038084874864,0.010296557993316037,0
17
+ rte,acc,0.5487364620938628,0.029953149241808946,0
18
+ sciq,acc,0.927,0.00823035471524406,0
19
+ sciq,acc_norm,0.921,0.008534156773333442,0
20
+ storycloze_2016,acc,0.7493319080705505,0.010022263975606228,0
21
+ winogrande,acc,0.6503551696921863,0.013402073680850508,0
perplexity25/evaluation/rankeval/perplexity25_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.342,
5
- "acc_stderr": 0.01500870618212173
6
- },
7
- "anli_r2": {
8
- "acc": 0.34,
9
- "acc_stderr": 0.014987482264363937
10
- },
11
- "anli_r3": {
12
- "acc": 0.335,
13
- "acc_stderr": 0.013630871843821479
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.31573655103066867
19
- },
20
- "copa": {
21
- "acc": 0.83,
22
- "acc_stderr": 0.03775251680686371
23
- },
24
- "hellaswag": {
25
- "acc": 0.4805815574586736,
26
- "acc_stderr": 0.00498601693867853,
27
- "acc_norm": 0.6378211511651065,
28
- "acc_norm_stderr": 0.004796478664403837
29
- },
30
- "rte": {
31
- "acc": 0.5487364620938628,
32
- "acc_stderr": 0.029953149241808946
33
- },
34
- "winogrande": {
35
- "acc": 0.6503551696921863,
36
- "acc_stderr": 0.013402073680850508
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7493319080705505,
40
- "acc_stderr": 0.010022263975606228
41
- },
42
- "boolq": {
43
- "acc": 0.6501529051987768,
44
- "acc_stderr": 0.008341409251946758
45
- },
46
- "arc_easy": {
47
- "acc": 0.680976430976431,
48
- "acc_stderr": 0.009564133249441074,
49
- "acc_norm": 0.6658249158249159,
50
- "acc_norm_stderr": 0.009679106032919058
51
- },
52
- "arc_challenge": {
53
- "acc": 0.34044368600682595,
54
- "acc_stderr": 0.013847460518892981,
55
- "acc_norm": 0.36945392491467577,
56
- "acc_norm_stderr": 0.0141045783664919
57
- },
58
- "sciq": {
59
- "acc": 0.927,
60
- "acc_stderr": 0.00823035471524406,
61
- "acc_norm": 0.921,
62
- "acc_norm_stderr": 0.008534156773333442
63
- },
64
- "piqa": {
65
- "acc": 0.7383025027203483,
66
- "acc_stderr": 0.010255630772708229,
67
- "acc_norm": 0.735038084874864,
68
- "acc_norm_stderr": 0.010296557993316037
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.10058915023610071
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.10058915023610071
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.15977592553077005
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.15977592553077005
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.18872461991388448
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.18872461991388448
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2016971233453876
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2016971233453876
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.20671915111813988
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.20671915111813988
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.20755693489144728
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.20755693489144728
14
+ e2e_nlg_cleaned,5,average,multiple,0.17751048417262166
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05426109744027298
16
+ gem_xsum,0,median,rouge2_fmeasure,0.05426109744027298
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04002835284879521
18
+ gem_xsum,1,median,rouge2_fmeasure,0.04002835284879521
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04546623699775454
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04546623699775454
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04301170567438614
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04301170567438614
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010629103476078454
24
+ gem_xsum,4,median,rouge2_fmeasure,0.010629103476078454
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00022888298907867294
26
+ gem_xsum,5,median,rouge2_fmeasure,0.00022888298907867294
27
+ gem_xsum,5,average,multiple,0.032270896571061
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05636376429781428
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05636376429781428
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05494866999157076
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.05494866999157076
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05613328052522034
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.05613328052522034
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05780751377987367
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05780751377987367
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0588262051974305
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.0588262051974305
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05914854622489272
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.05914854622489272
40
+ web_nlg_en,5,average,multiple,0.057204663336133714
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03242729118049159
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03242729118049159
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.051296962531802784
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.051296962531802784
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05835798348350554
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.05835798348350554
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04724388490620084
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04724388490620084
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01535069013326034
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01535069013326034
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002325818245704019
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.002325818245704019
53
+ wiki_lingua_en,5,average,multiple,0.034500438413494185
perplexity50/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3841955455859648, "bleu_stderr": 0.03834484804445635, "rouge1_fmeasure": 0.12051532399725066, "rouge1_fmeasure_stderr": 0.002201660726069953, "rouge1_precision": 0.08014992228438296, "rouge1_precision_stderr": 0.0017168290656594826, "rouge1_recall": 0.3261497758575972, "rouge1_recall_stderr": 0.004673915338473915, "rouge2_fmeasure": 0.05636376429781428, "rouge2_fmeasure_stderr": 0.001380678937179018, "rouge2_precision": 0.037046602853886695, "rouge2_precision_stderr": 0.0010494705661268828, "rouge2_recall": 0.1571021450899692, "rouge2_recall_stderr": 0.0033056759391242784, "rougeL_fmeasure": 0.11589156234984295, "rougeL_fmeasure_stderr": 0.002019174904591411, "rougeL_precision": 0.07675495084341123, "rougeL_precision_stderr": 0.0015593801603559256, "rougeL_recall": 0.3167777456526712, "rougeL_recall_stderr": 0.0045215943809518, "rougeLsum_fmeasure": 0.11479926239465861, "rougeLsum_fmeasure_stderr": 0.0020583922218685075, "rougeLsum_precision": 0.07631587685180155, "rougeLsum_precision_stderr": 0.0016101491986833256, "rougeLsum_recall": 0.31168339901047554, "rougeLsum_recall_stderr": 0.004419194664667104}}, "1": {"PALM_prompt": {"bleu": 0.42759204004197143, "bleu_stderr": 0.023184098740316197, "rouge1_fmeasure": 0.11872984928779891, "rouge1_fmeasure_stderr": 0.0019464789074927673, "rouge1_precision": 0.0764954377721011, "rouge1_precision_stderr": 0.0014403924566495617, "rouge1_recall": 0.3699228352820796, "rouge1_recall_stderr": 0.004905399646426089, "rouge2_fmeasure": 0.05494866999157076, "rouge2_fmeasure_stderr": 0.0012146887370210208, "rouge2_precision": 0.03527667126582558, "rouge2_precision_stderr": 0.0008695445202518543, "rouge2_recall": 0.17789638991052, "rouge2_recall_stderr": 0.0035278151906693064, "rougeL_fmeasure": 0.1129384334238869, "rougeL_fmeasure_stderr": 0.0018191890467148604, "rougeL_precision": 0.07272581922783473, "rougeL_precision_stderr": 0.0013349614051097628, "rougeL_recall": 0.34880705084622615, "rougeL_recall_stderr": 0.004490070517958085, "rougeLsum_fmeasure": 0.11356822062600092, "rougeLsum_fmeasure_stderr": 0.0018599619391754974, "rougeLsum_precision": 0.07321976744637436, "rougeLsum_precision_stderr": 0.0013772614644632743, "rougeLsum_recall": 0.3519663428654811, "rougeLsum_recall_stderr": 0.004562356824400081}}, "2": {"PALM_prompt": {"bleu": 0.4856534095490541, "bleu_stderr": 0.02443151487945902, "rouge1_fmeasure": 0.12115437420866133, "rouge1_fmeasure_stderr": 0.0017815531385909358, "rouge1_precision": 0.07721718276217579, "rouge1_precision_stderr": 0.0013119823062237108, "rouge1_recall": 0.39692380063364113, "rouge1_recall_stderr": 0.004838247650813131, "rouge2_fmeasure": 0.05613328052522034, "rouge2_fmeasure_stderr": 0.0011336256501373859, "rouge2_precision": 0.035589537397562925, "rouge2_precision_stderr": 0.0007970484569143277, "rouge2_recall": 0.19548525884545162, "rouge2_recall_stderr": 0.003705977303917903, "rougeL_fmeasure": 0.11469727176936799, "rougeL_fmeasure_stderr": 0.0016974540069808568, "rougeL_precision": 0.07320664477451425, "rougeL_precision_stderr": 0.00124331025548219, "rougeL_recall": 0.36961196243693584, "rougeL_recall_stderr": 0.004355138581855341, "rougeLsum_fmeasure": 0.11594842776998675, "rougeLsum_fmeasure_stderr": 0.0017162153027005642, "rougeLsum_precision": 0.07398228224748984, "rougeLsum_precision_stderr": 0.0012654062337437574, "rougeLsum_recall": 0.3771975381479834, "rougeLsum_recall_stderr": 0.004532808747113653}}, "3": {"PALM_prompt": {"bleu": 0.5126461546548764, "bleu_stderr": 0.023481707085499674, "rouge1_fmeasure": 0.12296680669192865, "rouge1_fmeasure_stderr": 0.001771390257741117, "rouge1_precision": 0.07818118915673905, "rouge1_precision_stderr": 0.001307597764643951, "rouge1_recall": 0.40552198761426156, "rouge1_recall_stderr": 0.004891057939341875, "rouge2_fmeasure": 0.05780751377987367, "rouge2_fmeasure_stderr": 0.0011295131135826688, "rouge2_precision": 0.03650686263182726, "rouge2_precision_stderr": 0.0007906653215721775, "rouge2_recall": 0.2032738453519446, "rouge2_recall_stderr": 0.003736335783154159, "rougeL_fmeasure": 0.11565624884195182, "rougeL_fmeasure_stderr": 0.0016653289676033045, "rougeL_precision": 0.07364405212863913, "rougeL_precision_stderr": 0.0012259573312863687, "rougeL_recall": 0.3755672149905026, "rougeL_recall_stderr": 0.004294385844287458, "rougeLsum_fmeasure": 0.11715743772138398, "rougeLsum_fmeasure_stderr": 0.0016964195102981497, "rougeLsum_precision": 0.07457144104571109, "rougeLsum_precision_stderr": 0.001254395221332194, "rougeLsum_recall": 0.383723870677269, "rougeLsum_recall_stderr": 0.004507602950858476}}, "4": {"PALM_prompt": {"bleu": 0.5768263836110397, "bleu_stderr": 0.033531304625404476, "rouge1_fmeasure": 0.12484492984248315, "rouge1_fmeasure_stderr": 0.0017733677490337002, "rouge1_precision": 0.07912332276140266, "rouge1_precision_stderr": 0.001308722937203902, "rouge1_recall": 0.4163756942667881, "rouge1_recall_stderr": 0.0049142362912846065, "rouge2_fmeasure": 0.0588262051974305, "rouge2_fmeasure_stderr": 0.0011381389114864023, "rouge2_precision": 0.03703974751248662, "rouge2_precision_stderr": 0.000798849362280762, "rouge2_recall": 0.20955667473009687, "rouge2_recall_stderr": 0.003786190119236611, "rougeL_fmeasure": 0.11679920573658832, "rougeL_fmeasure_stderr": 0.0016587330040215838, "rougeL_precision": 0.07414629204331873, "rougeL_precision_stderr": 0.0012180987225705276, "rougeL_recall": 0.3839203236439917, "rougeL_recall_stderr": 0.0043260019772130365, "rougeLsum_fmeasure": 0.11925978856110157, "rougeLsum_fmeasure_stderr": 0.0016998268553025208, "rougeLsum_precision": 0.07566523029296329, "rougeLsum_precision_stderr": 0.0012550312341550222, "rougeLsum_recall": 0.3951330344622166, "rougeLsum_recall_stderr": 0.004544810736094279}}, "5": {"PALM_prompt": {"bleu": 0.6000731585078155, "bleu_stderr": 0.02958850942035399, "rouge1_fmeasure": 0.12519311022366508, "rouge1_fmeasure_stderr": 0.0017687940658429597, "rouge1_precision": 0.07936877002627572, "rouge1_precision_stderr": 0.0013478728108461428, "rouge1_recall": 0.42485523830131044, "rouge1_recall_stderr": 0.004961789167035332, "rouge2_fmeasure": 0.05914854622489272, "rouge2_fmeasure_stderr": 0.001134713521503068, "rouge2_precision": 0.03729344715424889, "rouge2_precision_stderr": 0.0008381379859023699, "rouge2_recall": 0.21558831324782035, "rouge2_recall_stderr": 0.00382217803604959, "rougeL_fmeasure": 0.11624930186573486, "rougeL_fmeasure_stderr": 0.0016499833416766872, "rougeL_precision": 0.07374550717756582, "rougeL_precision_stderr": 0.001229003964772475, "rougeL_recall": 0.3896512196098555, "rougeL_recall_stderr": 0.00438967659644571, "rougeLsum_fmeasure": 0.11877149951275545, "rougeLsum_fmeasure_stderr": 0.0016790431278768587, "rougeLsum_precision": 0.07529523941861134, "rougeLsum_precision_stderr": 0.0012574009028455731, "rougeLsum_recall": 0.4005167169588075, "rougeLsum_recall_stderr": 0.004552442433337306}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.383140911898804, "bleu_stderr": 0.06745292495577788, "rouge1_fmeasure": 0.1673082764887814, "rouge1_fmeasure_stderr": 0.001865906276285246, "rouge1_precision": 0.14252443763236, "rouge1_precision_stderr": 0.0018590735935872559, "rouge1_recall": 0.2433207401530673, "rouge1_recall_stderr": 0.0027216367874257704, "rouge2_fmeasure": 0.03242729118049159, "rouge2_fmeasure_stderr": 0.000822088219708237, "rouge2_precision": 0.027172539820821053, "rouge2_precision_stderr": 0.0007118578268436691, "rouge2_recall": 0.0491985860574383, "rouge2_recall_stderr": 0.0013607577897608023, "rougeL_fmeasure": 0.13361933070882745, "rougeL_fmeasure_stderr": 0.0013437669695864618, "rougeL_precision": 0.11248849161329692, "rougeL_precision_stderr": 0.0013075864685883915, "rougeL_recall": 0.19869623082735052, "rougeL_recall_stderr": 0.0021898356243837697, "rougeLsum_fmeasure": 0.15365517848220148, "rougeLsum_fmeasure_stderr": 0.001684866108354418, "rougeLsum_precision": 0.13063172936064044, "rougeLsum_precision_stderr": 0.0016740540617278972, "rougeLsum_recall": 0.22441355121485976, "rougeLsum_recall_stderr": 0.002501090278748021}}, "1": {"tldr_en": {"bleu": 2.7831136014630777, "bleu_stderr": 0.05823735961787848, "rouge1_fmeasure": 0.2160954413299178, "rouge1_fmeasure_stderr": 0.0019654215171466417, "rouge1_precision": 0.18512017707721384, "rouge1_precision_stderr": 0.0021094437947065357, "rouge1_recall": 0.3158252638107596, "rouge1_recall_stderr": 0.0028417150098632967, "rouge2_fmeasure": 0.051296962531802784, "rouge2_fmeasure_stderr": 0.0010375664756837545, "rouge2_precision": 0.04376467946822878, "rouge2_precision_stderr": 0.0009756253583246246, "rouge2_recall": 0.07778902526231471, "rouge2_recall_stderr": 0.0017144407061461767, "rougeL_fmeasure": 0.15429878943917802, "rougeL_fmeasure_stderr": 0.0013508750340413321, "rougeL_precision": 0.13071111972941768, "rougeL_precision_stderr": 0.0014200606755908336, "rougeL_recall": 0.23169837875792845, "rougeL_recall_stderr": 0.002280199313309875, "rougeLsum_fmeasure": 0.20352880625344108, "rougeLsum_fmeasure_stderr": 0.0018405295838661016, "rougeLsum_precision": 0.17416190886149602, "rougeLsum_precision_stderr": 0.0019763988764514343, "rougeLsum_recall": 0.29830367981123657, "rougeLsum_recall_stderr": 0.0026966353421627575}}, "2": {"tldr_en": {"bleu": 3.2957837666594587, "bleu_stderr": 0.08034379540818389, "rouge1_fmeasure": 0.22869891032831227, "rouge1_fmeasure_stderr": 0.001930285464026905, "rouge1_precision": 0.20719943998381424, "rouge1_precision_stderr": 0.002491870168779193, "rouge1_recall": 0.3275862651091397, "rouge1_recall_stderr": 0.0027741119190619616, "rouge2_fmeasure": 0.05835798348350554, "rouge2_fmeasure_stderr": 0.0011051187337092607, "rouge2_precision": 0.054125628291332856, "rouge2_precision_stderr": 0.0013199800821959524, "rouge2_recall": 0.08630431560199675, "rouge2_recall_stderr": 0.0018086295613497908, "rougeL_fmeasure": 0.16408965991494653, "rougeL_fmeasure_stderr": 0.0013803822196969012, "rougeL_precision": 0.148528573150794, "rougeL_precision_stderr": 0.0018930555426512554, "rougeL_recall": 0.24049407805551332, "rougeL_recall_stderr": 0.002292588701907806, "rougeLsum_fmeasure": 0.21661695161689976, "rougeLsum_fmeasure_stderr": 0.0018306086871906694, "rougeLsum_precision": 0.19626454756871028, "rougeLsum_precision_stderr": 0.0023738870826537344, "rougeLsum_recall": 0.3106148563327418, "rougeLsum_recall_stderr": 0.00264458163926524}}, "3": {"tldr_en": {"bleu": 3.117766006224522, "bleu_stderr": 0.08844168500594013, "rouge1_fmeasure": 0.18494884132953612, "rouge1_fmeasure_stderr": 0.0022949571244732228, "rouge1_precision": 0.1801846981582515, "rouge1_precision_stderr": 0.002966508978680055, "rouge1_recall": 0.25914746469386185, "rouge1_recall_stderr": 0.0033244732230100858, "rouge2_fmeasure": 0.04724388490620084, "rouge2_fmeasure_stderr": 0.0011136170490785228, "rouge2_precision": 0.048034735096925886, "rouge2_precision_stderr": 0.0015464762014200295, "rouge2_recall": 0.06803447603698609, "rouge2_recall_stderr": 0.0017069831079943856, "rougeL_fmeasure": 0.13464181051994895, "rougeL_fmeasure_stderr": 0.0016797153921897026, "rougeL_precision": 0.1324468640336949, "rougeL_precision_stderr": 0.0023342385848593035, "rougeL_recall": 0.19202614173003765, "rougeL_recall_stderr": 0.002624571586313395, "rougeLsum_fmeasure": 0.17539853983167855, "rougeLsum_fmeasure_stderr": 0.002171008535901723, "rougeLsum_precision": 0.17068094752096957, "rougeLsum_precision_stderr": 0.0028086345613548606, "rougeLsum_recall": 0.24641584828527527, "rougeLsum_recall_stderr": 0.0031770926187417964}}, "4": {"tldr_en": {"bleu": 0.5914410987220908, "bleu_stderr": 0.03510174866934131, "rouge1_fmeasure": 0.05827542021821347, "rouge1_fmeasure_stderr": 0.001989638358607021, "rouge1_precision": 0.060746566102344496, "rouge1_precision_stderr": 0.0024802575898741574, "rouge1_recall": 0.0844520531515112, "rouge1_recall_stderr": 0.0029460126839031564, "rouge2_fmeasure": 0.01535069013326034, "rouge2_fmeasure_stderr": 0.0007972694044836853, "rouge2_precision": 0.01708667593672377, "rouge2_precision_stderr": 0.0012309085284382452, "rouge2_recall": 0.023158372631812345, "rouge2_recall_stderr": 0.001255540288620037, "rougeL_fmeasure": 0.043279163481274745, "rougeL_fmeasure_stderr": 0.0014817290632566425, "rougeL_precision": 0.04628144425563542, "rougeL_precision_stderr": 0.0020233599391486622, "rougeL_recall": 0.06365609520147814, "rougeL_recall_stderr": 0.002267970708853241, "rougeLsum_fmeasure": 0.054917664734240174, "rougeLsum_fmeasure_stderr": 0.0018755702048733026, "rougeLsum_precision": 0.0574384376172157, "rougeLsum_precision_stderr": 0.002362476755396724, "rougeLsum_recall": 0.07967217250561419, "rougeLsum_recall_stderr": 0.0027846813063251916}}, "5": {"tldr_en": {"bleu": 4.2206706508824354e-07, "bleu_stderr": 9.033374866710449e-07, "rouge1_fmeasure": 0.008757736475992225, "rouge1_fmeasure_stderr": 0.000830866197712448, "rouge1_precision": 0.009084734687542615, "rouge1_precision_stderr": 0.001035058563997235, "rouge1_recall": 0.013139853227511232, "rouge1_recall_stderr": 0.001287810296386965, "rouge2_fmeasure": 0.002325818245704019, "rouge2_fmeasure_stderr": 0.00031017440648810845, "rouge2_precision": 0.002883518719217727, "rouge2_precision_stderr": 0.0005696461607134438, "rouge2_recall": 0.0035342303209918327, "rouge2_recall_stderr": 0.0005062242713917856, "rougeL_fmeasure": 0.0068130585987481495, "rougeL_fmeasure_stderr": 0.0006449066053870117, "rougeL_precision": 0.007262975272974422, "rougeL_precision_stderr": 0.0008786946312435219, "rougeL_recall": 0.01037090312723551, "rougeL_recall_stderr": 0.0010286097141161677, "rougeLsum_fmeasure": 0.008317350447949961, "rougeLsum_fmeasure_stderr": 0.0007883086779573902, "rougeLsum_precision": 0.008666293877274517, "rougeLsum_precision_stderr": 0.000996353612238138, "rougeLsum_recall": 0.012494692152033124, "rougeLsum_recall_stderr": 0.001221173699832202}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 5.285882687642843, "bleu_stderr": 0.06697426008552022, "rouge1_fmeasure": 0.23951822437486534, "rouge1_fmeasure_stderr": 0.0017048049458819286, "rouge1_precision": 0.19340923170637989, "rouge1_precision_stderr": 0.0016098131373697873, "rouge1_recall": 0.36655908361004397, "rouge1_recall_stderr": 0.002604816098151851, "rouge2_fmeasure": 0.10058915023610071, "rouge2_fmeasure_stderr": 0.0012779590967870102, "rouge2_precision": 0.07638118077243172, "rouge2_precision_stderr": 0.0010271765749693493, "rouge2_recall": 0.16004590783643435, "rouge2_recall_stderr": 0.002018059971951962, "rougeL_fmeasure": 0.21218418778210568, "rougeL_fmeasure_stderr": 0.0014038041647419447, "rougeL_precision": 0.16992379812238415, "rougeL_precision_stderr": 0.0012959255203026498, "rougeL_recall": 0.32762918553762216, "rougeL_recall_stderr": 0.0022741947911686495, "rougeLsum_fmeasure": 0.21087391773473857, "rougeLsum_fmeasure_stderr": 0.0016466676007216732, "rougeLsum_precision": 0.16993592420494444, "rougeLsum_precision_stderr": 0.0015045811195205502, "rougeLsum_recall": 0.32347836326516793, "rougeLsum_recall_stderr": 0.002536061380650285}}, "1": {"generate_text_restaurant": {"bleu": 8.759868718384176, "bleu_stderr": 0.08986207189239839, "rouge1_fmeasure": 0.39045142557557627, "rouge1_fmeasure_stderr": 0.0019753647001218705, "rouge1_precision": 0.3825672121452994, "rouge1_precision_stderr": 0.00234517833697759, "rouge1_recall": 0.4382865336266532, "rouge1_recall_stderr": 0.0027794348254113546, "rouge2_fmeasure": 0.15977592553077005, "rouge2_fmeasure_stderr": 0.0016281989808774435, "rouge2_precision": 0.1562403146993971, "rouge2_precision_stderr": 0.0017105000210557784, "rouge2_recall": 0.18141451294370953, "rouge2_recall_stderr": 0.0020502392167278793, "rougeL_fmeasure": 0.2774015144515331, "rougeL_fmeasure_stderr": 0.0016146206236499572, "rougeL_precision": 0.27150380747664543, "rougeL_precision_stderr": 0.0018690865365730313, "rougeL_recall": 0.31307144412730237, "rougeL_recall_stderr": 0.0022863268336661626, "rougeLsum_fmeasure": 0.32595782319940814, "rougeLsum_fmeasure_stderr": 0.0019248153927043945, "rougeLsum_precision": 0.31953264548547916, "rougeLsum_precision_stderr": 0.002206836566776707, "rougeLsum_recall": 0.3659092525728581, "rougeLsum_recall_stderr": 0.0025888812249551894}}, "2": {"generate_text_restaurant": {"bleu": 10.10327848388186, "bleu_stderr": 0.13416893009812642, "rouge1_fmeasure": 0.4150614272126317, "rouge1_fmeasure_stderr": 0.0019038601009108084, "rouge1_precision": 0.40047122427428994, "rouge1_precision_stderr": 0.0023102636338495812, "rouge1_recall": 0.4691288386223476, "rouge1_recall_stderr": 0.0026887985201774624, "rouge2_fmeasure": 0.18872461991388448, "rouge2_fmeasure_stderr": 0.0017027808165788005, "rouge2_precision": 0.18175344294276033, "rouge2_precision_stderr": 0.0017821897624685544, "rouge2_recall": 0.21581298714587557, "rouge2_recall_stderr": 0.0021699718182286156, "rougeL_fmeasure": 0.3077697188688082, "rougeL_fmeasure_stderr": 0.0016516793472863517, "rougeL_precision": 0.2962608515862683, "rougeL_precision_stderr": 0.001897045239202071, "rougeL_recall": 0.34990861003729856, "rougeL_recall_stderr": 0.0023602136361034433, "rougeLsum_fmeasure": 0.3519730669933395, "rougeLsum_fmeasure_stderr": 0.0019358104213338006, "rougeLsum_precision": 0.3398362419135114, "rougeLsum_precision_stderr": 0.0022378563960927657, "rougeLsum_recall": 0.3975797506250789, "rougeLsum_recall_stderr": 0.0025704790531562126}}, "3": {"generate_text_restaurant": {"bleu": 11.232267102526988, "bleu_stderr": 0.14068760640127415, "rouge1_fmeasure": 0.42815668930435585, "rouge1_fmeasure_stderr": 0.001927157531196204, "rouge1_precision": 0.41817325284774387, "rouge1_precision_stderr": 0.0023053737236325977, "rouge1_recall": 0.4752160858627036, "rouge1_recall_stderr": 0.0027142554814264357, "rouge2_fmeasure": 0.2016971233453876, "rouge2_fmeasure_stderr": 0.0017734485816419255, "rouge2_precision": 0.1964731857257769, "rouge2_precision_stderr": 0.0018466850642394885, "rouge2_recall": 0.22633557348204866, "rouge2_recall_stderr": 0.0022240683617011222, "rougeL_fmeasure": 0.3188281781066986, "rougeL_fmeasure_stderr": 0.0017307512933631892, "rougeL_precision": 0.3108487463908802, "rougeL_precision_stderr": 0.0019544617537378835, "rougeL_recall": 0.3553930435732314, "rougeL_recall_stderr": 0.0024025865771174024, "rougeLsum_fmeasure": 0.3640392540141833, "rougeLsum_fmeasure_stderr": 0.00199772383540772, "rougeLsum_precision": 0.355398185837249, "rougeLsum_precision_stderr": 0.002250685549026275, "rougeLsum_recall": 0.4042983855328548, "rougeLsum_recall_stderr": 0.0026636540624325345}}, "4": {"generate_text_restaurant": {"bleu": 11.52714583973097, "bleu_stderr": 0.16566149983325942, "rouge1_fmeasure": 0.43395966766003585, "rouge1_fmeasure_stderr": 0.001919830319829728, "rouge1_precision": 0.4224736137769234, "rouge1_precision_stderr": 0.002344672239310868, "rouge1_recall": 0.4824757808782152, "rouge1_recall_stderr": 0.002671564917406434, "rouge2_fmeasure": 0.20671915111813988, "rouge2_fmeasure_stderr": 0.001774125867426217, "rouge2_precision": 0.20102260284035095, "rouge2_precision_stderr": 0.0018660070700133957, "rouge2_recall": 0.2320395634175057, "rouge2_recall_stderr": 0.0022223744066551293, "rougeL_fmeasure": 0.32341769588154473, "rougeL_fmeasure_stderr": 0.001729818154828205, "rougeL_precision": 0.31443730192952396, "rougeL_precision_stderr": 0.001986378360109619, "rougeL_recall": 0.3608360400202065, "rougeL_recall_stderr": 0.002374485765520169, "rougeLsum_fmeasure": 0.3693259817282329, "rougeLsum_fmeasure_stderr": 0.002017414187660604, "rougeLsum_precision": 0.35940740482707384, "rougeLsum_precision_stderr": 0.002294856169924013, "rougeLsum_recall": 0.4106478368865337, "rougeLsum_recall_stderr": 0.002652156433027331}}, "5": {"generate_text_restaurant": {"bleu": 11.470373655769501, "bleu_stderr": 0.12527529313499408, "rouge1_fmeasure": 0.4356867279269184, "rouge1_fmeasure_stderr": 0.0019525540843502926, "rouge1_precision": 0.421800370876118, "rouge1_precision_stderr": 0.0023646580883461706, "rouge1_recall": 0.4865640272606994, "rouge1_recall_stderr": 0.002675409006075987, "rouge2_fmeasure": 0.20755693489144728, "rouge2_fmeasure_stderr": 0.001772980726536381, "rouge2_precision": 0.20064053316690955, "rouge2_precision_stderr": 0.0018548156820547273, "rouge2_recall": 0.23383994456326945, "rouge2_recall_stderr": 0.0021887036234265046, "rougeL_fmeasure": 0.32464613664724695, "rougeL_fmeasure_stderr": 0.0017180101489382377, "rougeL_precision": 0.31356023285448703, "rougeL_precision_stderr": 0.0019519642683972734, "rougeL_recall": 0.36423548391706, "rougeL_recall_stderr": 0.0023618289325691875, "rougeLsum_fmeasure": 0.3709229849290071, "rougeLsum_fmeasure_stderr": 0.002028707851697601, "rougeLsum_precision": 0.35931617332832466, "rougeLsum_precision_stderr": 0.00232575064650851, "rougeLsum_recall": 0.4139321110238479, "rougeLsum_recall_stderr": 0.002614052034797568}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.286764120955454, "bleu_stderr": 0.10487671432151699, "rouge1_fmeasure": 0.2170704363655291, "rouge1_fmeasure_stderr": 0.0026009964674546487, "rouge1_precision": 0.16086222686885998, "rouge1_precision_stderr": 0.0022252447270736404, "rouge1_recall": 0.363993757386026, "rouge1_recall_stderr": 0.004447906860355032, "rouge2_fmeasure": 0.05426109744027298, "rouge2_fmeasure_stderr": 0.0017575127883179419, "rouge2_precision": 0.03965743348041025, "rouge2_precision_stderr": 0.0014470240504900656, "rouge2_recall": 0.09447416461397456, "rouge2_recall_stderr": 0.003039878935146692, "rougeL_fmeasure": 0.1645988844863308, "rougeL_fmeasure_stderr": 0.0020230326221713562, "rougeL_precision": 0.12165923066163074, "rougeL_precision_stderr": 0.001738521026475504, "rougeL_recall": 0.27815207821679444, "rougeL_recall_stderr": 0.0036330563038200624, "rougeLsum_fmeasure": 0.17049497991799512, "rougeLsum_fmeasure_stderr": 0.002199479559253552, "rougeLsum_precision": 0.12592899489421042, "rougeLsum_precision_stderr": 0.0018469201981654569, "rougeLsum_recall": 0.28818031436396413, "rougeLsum_recall_stderr": 0.00395186316132106}}, "1": {"article_DOC_summary": {"bleu": 1.601855503412538, "bleu_stderr": 0.07258015457329471, "rouge1_fmeasure": 0.18432241198153712, "rouge1_fmeasure_stderr": 0.0025753000481448644, "rouge1_precision": 0.13106642326274354, "rouge1_precision_stderr": 0.0019193860865765633, "rouge1_recall": 0.3234658508585958, "rouge1_recall_stderr": 0.0043661264753635486, "rouge2_fmeasure": 0.04002835284879521, "rouge2_fmeasure_stderr": 0.0014781883740027449, "rouge2_precision": 0.02822515438220016, "rouge2_precision_stderr": 0.0010434086746188042, "rouge2_recall": 0.07162285378667402, "rouge2_recall_stderr": 0.002714817358725964, "rougeL_fmeasure": 0.14101261823875597, "rougeL_fmeasure_stderr": 0.0019068168040762914, "rougeL_precision": 0.10003470838188194, "rougeL_precision_stderr": 0.0014061413510224204, "rougeL_recall": 0.24915600486179032, "rougeL_recall_stderr": 0.003347425480308083, "rougeLsum_fmeasure": 0.14771150001250982, "rougeLsum_fmeasure_stderr": 0.00211511203484802, "rougeLsum_precision": 0.10480543277054401, "rougeLsum_precision_stderr": 0.0015545955460983404, "rougeLsum_recall": 0.2607965323547307, "rougeLsum_recall_stderr": 0.003724571304451384}}, "2": {"article_DOC_summary": {"bleu": 1.8380438639962695, "bleu_stderr": 0.10811928358813763, "rouge1_fmeasure": 0.19535202200220275, "rouge1_fmeasure_stderr": 0.0026346536005903373, "rouge1_precision": 0.13887280196688523, "rouge1_precision_stderr": 0.001956740885200385, "rouge1_recall": 0.34294827785468185, "rouge1_recall_stderr": 0.004514777263625162, "rouge2_fmeasure": 0.04546623699775454, "rouge2_fmeasure_stderr": 0.001612092783689185, "rouge2_precision": 0.0320190408417148, "rouge2_precision_stderr": 0.0011343925357226905, "rouge2_recall": 0.08177449666512925, "rouge2_recall_stderr": 0.003014469934741339, "rougeL_fmeasure": 0.14811536601585223, "rougeL_fmeasure_stderr": 0.00196180609857125, "rougeL_precision": 0.10508517578596618, "rougeL_precision_stderr": 0.0014455778496582211, "rougeL_recall": 0.26185041154157507, "rougeL_recall_stderr": 0.003505120843276718, "rougeLsum_fmeasure": 0.1571036308870091, "rougeLsum_fmeasure_stderr": 0.0021764703685117513, "rougeLsum_precision": 0.11141636053588261, "rougeLsum_precision_stderr": 0.0015953318320649059, "rougeLsum_recall": 0.27772475736482033, "rougeLsum_recall_stderr": 0.003883573652450414}}, "3": {"article_DOC_summary": {"bleu": 1.8731170479468633, "bleu_stderr": 0.12212530111306877, "rouge1_fmeasure": 0.1850973564810976, "rouge1_fmeasure_stderr": 0.0028685807268665573, "rouge1_precision": 0.13393465093211152, "rouge1_precision_stderr": 0.002178546476915688, "rouge1_recall": 0.3208349496275703, "rouge1_recall_stderr": 0.005017270430039093, "rouge2_fmeasure": 0.04301170567438614, "rouge2_fmeasure_stderr": 0.0016643347985338668, "rouge2_precision": 0.030603540250172215, "rouge2_precision_stderr": 0.0012053283031466103, "rouge2_recall": 0.07656078303232518, "rouge2_recall_stderr": 0.003042910587753952, "rougeL_fmeasure": 0.14087163035130615, "rougeL_fmeasure_stderr": 0.002174721918788832, "rougeL_precision": 0.10190253909758554, "rougeL_precision_stderr": 0.001662230819445601, "rougeL_recall": 0.2452262253929823, "rougeL_recall_stderr": 0.0038665453486106385, "rougeLsum_fmeasure": 0.14807083425641282, "rougeLsum_fmeasure_stderr": 0.0023776983157151816, "rougeLsum_precision": 0.10713139947740546, "rougeLsum_precision_stderr": 0.001810818001072249, "rougeLsum_recall": 0.2577398531586797, "rougeLsum_recall_stderr": 0.004224949592217397}}, "4": {"article_DOC_summary": {"bleu": 0.8072921803610522, "bleu_stderr": 0.13481405520653453, "rouge1_fmeasure": 0.04957113221677667, "rouge1_fmeasure_stderr": 0.002776223278435835, "rouge1_precision": 0.04237213490568202, "rouge1_precision_stderr": 0.002644486260111252, "rouge1_recall": 0.07779789094117907, "rouge1_recall_stderr": 0.004474695319161571, "rouge2_fmeasure": 0.010629103476078454, "rouge2_fmeasure_stderr": 0.0009487229867910699, "rouge2_precision": 0.00882760521824164, "rouge2_precision_stderr": 0.0009353280430989058, "rouge2_recall": 0.017870091729374663, "rouge2_recall_stderr": 0.0016736090478303408, "rougeL_fmeasure": 0.03785078570554781, "rougeL_fmeasure_stderr": 0.002102296416669563, "rougeL_precision": 0.032755367117331305, "rougeL_precision_stderr": 0.0020851660139623363, "rougeL_recall": 0.059623992862197994, "rougeL_recall_stderr": 0.0034445885527070548, "rougeLsum_fmeasure": 0.040118179164564276, "rougeLsum_fmeasure_stderr": 0.002244409483615815, "rougeLsum_precision": 0.03457603039331168, "rougeLsum_precision_stderr": 0.00217697156328438, "rougeLsum_recall": 0.06328733938197696, "rougeLsum_recall_stderr": 0.003695669480428956}}, "5": {"article_DOC_summary": {"bleu": 3.474750506260909e-38, "bleu_stderr": 2.4761840064885677e-32, "rouge1_fmeasure": 0.0022401437562342683, "rouge1_fmeasure_stderr": 0.0007153938341072901, "rouge1_precision": 0.002595925373248815, "rouge1_precision_stderr": 0.0008354910877971658, "rouge1_recall": 0.001999825463947682, "rouge1_recall_stderr": 0.000634288246013444, "rouge2_fmeasure": 0.00022888298907867294, "rouge2_fmeasure_stderr": 0.00012995777833085082, "rouge2_precision": 0.00027170862025409603, "rouge2_precision_stderr": 0.00014974174761724665, "rouge2_recall": 0.00019881893910750738, "rouge2_recall_stderr": 0.00011559008179568124, "rougeL_fmeasure": 0.0014744333692215336, "rougeL_fmeasure_stderr": 0.00042158052355062295, "rougeL_precision": 0.0016970776755226781, "rougeL_precision_stderr": 0.0004959302048426671, "rougeL_recall": 0.001329703638978752, "rougeL_recall_stderr": 0.0003765201577760315, "rougeLsum_fmeasure": 0.0015935490543749545, "rougeLsum_fmeasure_stderr": 0.000460457141424318, "rougeLsum_precision": 0.0018375087990719742, "rougeLsum_precision_stderr": 0.0005450511389629724, "rougeLsum_recall": 0.0014340716678750828, "rougeLsum_recall_stderr": 0.0004083189310550863}}}}
perplexity50/evaluation/rankeval/perplexity50_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.345,0.015039986742055237,0
3
+ anli_r2,acc,0.338,0.014965960710224496,0
4
+ anli_r3,acc,0.36666666666666664,0.013916893275819943,0
5
+ arc_challenge,acc,0.29692832764505117,0.013352025976725223,0
6
+ arc_challenge,acc_norm,0.31569965870307165,0.01358257109581529,0
7
+ arc_easy,acc,0.6355218855218855,0.009875729282482438,0
8
+ arc_easy,acc_norm,0.5563973063973064,0.010194308914521133,0
9
+ boolq,acc,0.6018348623853211,0.008561755594317447,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.23085585585585586,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4962158932483569,0.004989638507409924,0
14
+ hellaswag,acc_norm,0.6558454491137223,0.004741208229092877,0
15
+ piqa,acc,0.7573449401523396,0.010002002569708698,0
16
+ piqa,acc_norm,0.7633297062023939,0.009916841655042809,0
17
+ rte,acc,0.47653429602888087,0.03006330041190266,0
18
+ sciq,acc,0.846,0.011419913065098703,0
19
+ sciq,acc_norm,0.775,0.013211720158614751,0
20
+ storycloze_2016,acc,0.7338321753073223,0.010220104800551206,0
21
+ winogrande,acc,0.5935280189423836,0.01380444869775337,0
perplexity50/evaluation/rankeval/perplexity50_0_lm-eval_global_step80108_2023-05-13-09-53-07_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.345,
5
- "acc_stderr": 0.015039986742055237
6
- },
7
- "anli_r2": {
8
- "acc": 0.338,
9
- "acc_stderr": 0.014965960710224496
10
- },
11
- "anli_r3": {
12
- "acc": 0.36666666666666664,
13
- "acc_stderr": 0.013916893275819943
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.23085585585585586
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4962158932483569,
26
- "acc_stderr": 0.004989638507409924,
27
- "acc_norm": 0.6558454491137223,
28
- "acc_norm_stderr": 0.004741208229092877
29
- },
30
- "rte": {
31
- "acc": 0.47653429602888087,
32
- "acc_stderr": 0.03006330041190266
33
- },
34
- "winogrande": {
35
- "acc": 0.5935280189423836,
36
- "acc_stderr": 0.01380444869775337
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7338321753073223,
40
- "acc_stderr": 0.010220104800551206
41
- },
42
- "boolq": {
43
- "acc": 0.6018348623853211,
44
- "acc_stderr": 0.008561755594317447
45
- },
46
- "arc_easy": {
47
- "acc": 0.6355218855218855,
48
- "acc_stderr": 0.009875729282482438,
49
- "acc_norm": 0.5563973063973064,
50
- "acc_norm_stderr": 0.010194308914521133
51
- },
52
- "arc_challenge": {
53
- "acc": 0.29692832764505117,
54
- "acc_stderr": 0.013352025976725223,
55
- "acc_norm": 0.31569965870307165,
56
- "acc_norm_stderr": 0.01358257109581529
57
- },
58
- "sciq": {
59
- "acc": 0.846,
60
- "acc_stderr": 0.011419913065098703,
61
- "acc_norm": 0.775,
62
- "acc_norm_stderr": 0.013211720158614751
63
- },
64
- "piqa": {
65
- "acc": 0.7573449401523396,
66
- "acc_stderr": 0.010002002569708698,
67
- "acc_norm": 0.7633297062023939,
68
- "acc_norm_stderr": 0.009916841655042809
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.339,0.014976758771620347,0
3
+ anli_r2,acc,0.313,0.014671272822977886,0
4
+ anli_r3,acc,0.3425,0.013704669762934727,0
5
+ arc_challenge,acc,0.3122866894197952,0.013542598541688065,0
6
+ arc_challenge,acc_norm,0.3412969283276451,0.01385583128749772,0
7
+ arc_easy,acc,0.6439393939393939,0.009825454608416316,0
8
+ arc_easy,acc_norm,0.6035353535353535,0.010037412763064529,0
9
+ boolq,acc,0.6131498470948012,0.008518188340844762,1
10
+ cb,acc,0.5,0.06741998624632421,1
11
+ cb,f1,0.35220125786163514,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.49502091216889066,0.004989533998820352,0
14
+ hellaswag,acc_norm,0.6565425214100776,0.0047389206247244785,0
15
+ piqa,acc,0.766050054406964,0.009877236895137455,0
16
+ piqa,acc_norm,0.7622415669205659,0.009932525779525492,0
17
+ rte,acc,0.5451263537906137,0.029973636495415252,0
18
+ sciq,acc,0.909,0.009099549538400236,0
19
+ sciq,acc_norm,0.89,0.009899393819724442,0
20
+ storycloze_2016,acc,0.7354355959380011,0.010200400541714168,0
21
+ winogrande,acc,0.6053670086819258,0.013736915172371885,0
perplexity50/evaluation/rankeval/perplexity50_1_lm-eval_global_step80108_2023-05-13-09-53-07_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.339,
5
- "acc_stderr": 0.014976758771620347
6
- },
7
- "anli_r2": {
8
- "acc": 0.313,
9
- "acc_stderr": 0.014671272822977886
10
- },
11
- "anli_r3": {
12
- "acc": 0.3425,
13
- "acc_stderr": 0.013704669762934727
14
- },
15
- "cb": {
16
- "acc": 0.5,
17
- "acc_stderr": 0.06741998624632421,
18
- "f1": 0.35220125786163514
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.49502091216889066,
26
- "acc_stderr": 0.004989533998820352,
27
- "acc_norm": 0.6565425214100776,
28
- "acc_norm_stderr": 0.0047389206247244785
29
- },
30
- "rte": {
31
- "acc": 0.5451263537906137,
32
- "acc_stderr": 0.029973636495415252
33
- },
34
- "winogrande": {
35
- "acc": 0.6053670086819258,
36
- "acc_stderr": 0.013736915172371885
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7354355959380011,
40
- "acc_stderr": 0.010200400541714168
41
- },
42
- "boolq": {
43
- "acc": 0.6131498470948012,
44
- "acc_stderr": 0.008518188340844762
45
- },
46
- "arc_easy": {
47
- "acc": 0.6439393939393939,
48
- "acc_stderr": 0.009825454608416316,
49
- "acc_norm": 0.6035353535353535,
50
- "acc_norm_stderr": 0.010037412763064529
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3122866894197952,
54
- "acc_stderr": 0.013542598541688065,
55
- "acc_norm": 0.3412969283276451,
56
- "acc_norm_stderr": 0.01385583128749772
57
- },
58
- "sciq": {
59
- "acc": 0.909,
60
- "acc_stderr": 0.009099549538400236,
61
- "acc_norm": 0.89,
62
- "acc_norm_stderr": 0.009899393819724442
63
- },
64
- "piqa": {
65
- "acc": 0.766050054406964,
66
- "acc_stderr": 0.009877236895137455,
67
- "acc_norm": 0.7622415669205659,
68
- "acc_norm_stderr": 0.009932525779525492
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.315,0.0146966319607925,0
3
+ anli_r2,acc,0.335,0.014933117490932575,0
4
+ anli_r3,acc,0.34,0.013680495725767789,0
5
+ arc_challenge,acc,0.3293515358361775,0.013734057652635473,0
6
+ arc_challenge,acc_norm,0.3387372013651877,0.01383056892797433,0
7
+ arc_easy,acc,0.6523569023569024,0.00977186884683091,0
8
+ arc_easy,acc_norm,0.6266835016835017,0.009925009142802893,0
9
+ boolq,acc,0.6217125382262997,0.008482001133930994,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.2854808590102708,,1
12
+ copa,acc,0.84,0.03684529491774709,0
13
+ hellaswag,acc,0.4934276040629357,0.004989350311751647,0
14
+ hellaswag,acc_norm,0.6552479585739892,0.004743160034271143,0
15
+ piqa,acc,0.7627856365614799,0.00992469493358637,0
16
+ piqa,acc_norm,0.7709466811751904,0.009804509865175505,0
17
+ rte,acc,0.5018050541516246,0.030096267148976626,0
18
+ sciq,acc,0.916,0.008776162089491132,0
19
+ sciq,acc_norm,0.892,0.009820001651345682,0
20
+ storycloze_2016,acc,0.7413148049171566,0.010126662138021712,0
21
+ winogrande,acc,0.6156274664561957,0.013671567600836192,0
perplexity50/evaluation/rankeval/perplexity50_2_lm-eval_global_step80108_2023-05-13-09-53-07_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.315,
5
- "acc_stderr": 0.0146966319607925
6
- },
7
- "anli_r2": {
8
- "acc": 0.335,
9
- "acc_stderr": 0.014933117490932575
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.013680495725767789
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.2854808590102708
19
- },
20
- "copa": {
21
- "acc": 0.84,
22
- "acc_stderr": 0.03684529491774709
23
- },
24
- "hellaswag": {
25
- "acc": 0.4934276040629357,
26
- "acc_stderr": 0.004989350311751647,
27
- "acc_norm": 0.6552479585739892,
28
- "acc_norm_stderr": 0.004743160034271143
29
- },
30
- "rte": {
31
- "acc": 0.5018050541516246,
32
- "acc_stderr": 0.030096267148976626
33
- },
34
- "winogrande": {
35
- "acc": 0.6156274664561957,
36
- "acc_stderr": 0.013671567600836192
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7413148049171566,
40
- "acc_stderr": 0.010126662138021712
41
- },
42
- "boolq": {
43
- "acc": 0.6217125382262997,
44
- "acc_stderr": 0.008482001133930994
45
- },
46
- "arc_easy": {
47
- "acc": 0.6523569023569024,
48
- "acc_stderr": 0.00977186884683091,
49
- "acc_norm": 0.6266835016835017,
50
- "acc_norm_stderr": 0.009925009142802893
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3293515358361775,
54
- "acc_stderr": 0.013734057652635473,
55
- "acc_norm": 0.3387372013651877,
56
- "acc_norm_stderr": 0.01383056892797433
57
- },
58
- "sciq": {
59
- "acc": 0.916,
60
- "acc_stderr": 0.008776162089491132,
61
- "acc_norm": 0.892,
62
- "acc_norm_stderr": 0.009820001651345682
63
- },
64
- "piqa": {
65
- "acc": 0.7627856365614799,
66
- "acc_stderr": 0.00992469493358637,
67
- "acc_norm": 0.7709466811751904,
68
- "acc_norm_stderr": 0.009804509865175505
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.32,0.0147586523035749,0
3
+ anli_r2,acc,0.335,0.014933117490932572,0
4
+ anli_r3,acc,0.3641666666666667,0.013896714966807255,0
5
+ arc_challenge,acc,0.3199658703071672,0.013631345807016196,0
6
+ arc_challenge,acc_norm,0.3438566552901024,0.013880644570156217,0
7
+ arc_easy,acc,0.6531986531986532,0.009766326091716007,0
8
+ arc_easy,acc_norm,0.6418350168350169,0.009838331651451853,0
9
+ boolq,acc,0.6275229357798165,0.008455846866956081,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.3114930182599356,,1
12
+ copa,acc,0.86,0.03487350880197771,0
13
+ hellaswag,acc,0.4963154750049791,0.004989645929811442,0
14
+ hellaswag,acc_norm,0.6570404301931886,0.004737279691036198,0
15
+ piqa,acc,0.7595212187159956,0.009971345364651076,0
16
+ piqa,acc_norm,0.7671381936887922,0.009861236071080751,0
17
+ rte,acc,0.5234657039711191,0.03006330041190266,0
18
+ sciq,acc,0.916,0.008776162089491139,0
19
+ sciq,acc_norm,0.91,0.009054390204866442,0
20
+ storycloze_2016,acc,0.7466595403527525,0.010057563497401457,0
21
+ winogrande,acc,0.6045777426992897,0.013741678387545345,0
perplexity50/evaluation/rankeval/perplexity50_3_lm-eval_global_step80108_2023-05-13-09-53-07_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.32,
5
- "acc_stderr": 0.0147586523035749
6
- },
7
- "anli_r2": {
8
- "acc": 0.335,
9
- "acc_stderr": 0.014933117490932572
10
- },
11
- "anli_r3": {
12
- "acc": 0.3641666666666667,
13
- "acc_stderr": 0.013896714966807255
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.3114930182599356
19
- },
20
- "copa": {
21
- "acc": 0.86,
22
- "acc_stderr": 0.03487350880197771
23
- },
24
- "hellaswag": {
25
- "acc": 0.4963154750049791,
26
- "acc_stderr": 0.004989645929811442,
27
- "acc_norm": 0.6570404301931886,
28
- "acc_norm_stderr": 0.004737279691036198
29
- },
30
- "rte": {
31
- "acc": 0.5234657039711191,
32
- "acc_stderr": 0.03006330041190266
33
- },
34
- "winogrande": {
35
- "acc": 0.6045777426992897,
36
- "acc_stderr": 0.013741678387545345
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7466595403527525,
40
- "acc_stderr": 0.010057563497401457
41
- },
42
- "boolq": {
43
- "acc": 0.6275229357798165,
44
- "acc_stderr": 0.008455846866956081
45
- },
46
- "arc_easy": {
47
- "acc": 0.6531986531986532,
48
- "acc_stderr": 0.009766326091716007,
49
- "acc_norm": 0.6418350168350169,
50
- "acc_norm_stderr": 0.009838331651451853
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3199658703071672,
54
- "acc_stderr": 0.013631345807016196,
55
- "acc_norm": 0.3438566552901024,
56
- "acc_norm_stderr": 0.013880644570156217
57
- },
58
- "sciq": {
59
- "acc": 0.916,
60
- "acc_stderr": 0.008776162089491139,
61
- "acc_norm": 0.91,
62
- "acc_norm_stderr": 0.009054390204866442
63
- },
64
- "piqa": {
65
- "acc": 0.7595212187159956,
66
- "acc_stderr": 0.009971345364651076,
67
- "acc_norm": 0.7671381936887922,
68
- "acc_norm_stderr": 0.009861236071080751
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.324,0.014806864733738863,0
3
+ anli_r2,acc,0.348,0.01507060460376841,0
4
+ anli_r3,acc,0.35083333333333333,0.013782212417178195,0
5
+ arc_challenge,acc,0.3293515358361775,0.013734057652635473,0
6
+ arc_challenge,acc_norm,0.3430034129692833,0.013872423223718169,0
7
+ arc_easy,acc,0.664983164983165,0.009685160765932357,0
8
+ arc_easy,acc_norm,0.6506734006734006,0.009782853449399293,0
9
+ boolq,acc,0.618960244648318,0.008493937524439337,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.39031339031339024,,1
12
+ copa,acc,0.84,0.0368452949177471,0
13
+ hellaswag,acc,0.4969129655447122,0.004989686307484557,0
14
+ hellaswag,acc_norm,0.6625174268074089,0.004718846448021788,0
15
+ piqa,acc,0.7627856365614799,0.00992469493358637,0
16
+ piqa,acc_norm,0.7622415669205659,0.009932525779525492,0
17
+ rte,acc,0.49458483754512633,0.030094698123239966,0
18
+ sciq,acc,0.924,0.008384169266796387,0
19
+ sciq,acc_norm,0.918,0.008680515615523745,0
20
+ storycloze_2016,acc,0.7471940138963121,0.010050543909878586,0
21
+ winogrande,acc,0.6298342541436464,0.013570454689603911,0
perplexity50/evaluation/rankeval/perplexity50_4_lm-eval_global_step80108_2023-05-13-09-53-07_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.324,
5
- "acc_stderr": 0.014806864733738863
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.01507060460376841
10
- },
11
- "anli_r3": {
12
- "acc": 0.35083333333333333,
13
- "acc_stderr": 0.013782212417178195
14
- },
15
- "cb": {
16
- "acc": 0.5535714285714286,
17
- "acc_stderr": 0.06703189227942395,
18
- "f1": 0.39031339031339024
19
- },
20
- "copa": {
21
- "acc": 0.84,
22
- "acc_stderr": 0.0368452949177471
23
- },
24
- "hellaswag": {
25
- "acc": 0.4969129655447122,
26
- "acc_stderr": 0.004989686307484557,
27
- "acc_norm": 0.6625174268074089,
28
- "acc_norm_stderr": 0.004718846448021788
29
- },
30
- "rte": {
31
- "acc": 0.49458483754512633,
32
- "acc_stderr": 0.030094698123239966
33
- },
34
- "winogrande": {
35
- "acc": 0.6298342541436464,
36
- "acc_stderr": 0.013570454689603911
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7471940138963121,
40
- "acc_stderr": 0.010050543909878586
41
- },
42
- "boolq": {
43
- "acc": 0.618960244648318,
44
- "acc_stderr": 0.008493937524439337
45
- },
46
- "arc_easy": {
47
- "acc": 0.664983164983165,
48
- "acc_stderr": 0.009685160765932357,
49
- "acc_norm": 0.6506734006734006,
50
- "acc_norm_stderr": 0.009782853449399293
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3293515358361775,
54
- "acc_stderr": 0.013734057652635473,
55
- "acc_norm": 0.3430034129692833,
56
- "acc_norm_stderr": 0.013872423223718169
57
- },
58
- "sciq": {
59
- "acc": 0.924,
60
- "acc_stderr": 0.008384169266796387,
61
- "acc_norm": 0.918,
62
- "acc_norm_stderr": 0.008680515615523745
63
- },
64
- "piqa": {
65
- "acc": 0.7627856365614799,
66
- "acc_stderr": 0.00992469493358637,
67
- "acc_norm": 0.7622415669205659,
68
- "acc_norm_stderr": 0.009932525779525492
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
perplexity50/evaluation/rankeval/perplexity50_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.327,0.014842213153411242,0
3
+ anli_r2,acc,0.329,0.014865395385928359,0
4
+ anli_r3,acc,0.35083333333333333,0.013782212417178193,0
5
+ arc_challenge,acc,0.3302047781569966,0.013743085603760427,0
6
+ arc_challenge,acc_norm,0.3464163822525597,0.013905011180063246,0
7
+ arc_easy,acc,0.6662457912457912,0.009676065683575472,0
8
+ arc_easy,acc_norm,0.656986531986532,0.009740965666489234,0
9
+ boolq,acc,0.6204892966360857,0.00848734197575683,1
10
+ cb,acc,0.5714285714285714,0.06672848092813058,1
11
+ cb,f1,0.3985557884928954,,1
12
+ copa,acc,0.84,0.03684529491774709,0
13
+ hellaswag,acc,0.49422425811591314,0.004989448490164432,0
14
+ hellaswag,acc_norm,0.6600278828918542,0.004727312448892851,0
15
+ piqa,acc,0.7573449401523396,0.010002002569708698,0
16
+ piqa,acc_norm,0.7633297062023939,0.009916841655042809,0
17
+ rte,acc,0.5234657039711191,0.030063300411902652,0
18
+ sciq,acc,0.922,0.008484573530118585,0
19
+ sciq,acc_norm,0.922,0.008484573530118585,0
20
+ storycloze_2016,acc,0.7487974345269909,0.010029364281941636,0
21
+ winogrande,acc,0.6203630623520127,0.013639245403711156,0
perplexity50/evaluation/rankeval/perplexity50_5_lm-eval_global_step80108_2023-05-13-09-53-07_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.327,
5
- "acc_stderr": 0.014842213153411242
6
- },
7
- "anli_r2": {
8
- "acc": 0.329,
9
- "acc_stderr": 0.014865395385928359
10
- },
11
- "anli_r3": {
12
- "acc": 0.35083333333333333,
13
- "acc_stderr": 0.013782212417178193
14
- },
15
- "cb": {
16
- "acc": 0.5714285714285714,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.3985557884928954
19
- },
20
- "copa": {
21
- "acc": 0.84,
22
- "acc_stderr": 0.03684529491774709
23
- },
24
- "hellaswag": {
25
- "acc": 0.49422425811591314,
26
- "acc_stderr": 0.004989448490164432,
27
- "acc_norm": 0.6600278828918542,
28
- "acc_norm_stderr": 0.004727312448892851
29
- },
30
- "rte": {
31
- "acc": 0.5234657039711191,
32
- "acc_stderr": 0.030063300411902652
33
- },
34
- "winogrande": {
35
- "acc": 0.6203630623520127,
36
- "acc_stderr": 0.013639245403711156
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7487974345269909,
40
- "acc_stderr": 0.010029364281941636
41
- },
42
- "boolq": {
43
- "acc": 0.6204892966360857,
44
- "acc_stderr": 0.00848734197575683
45
- },
46
- "arc_easy": {
47
- "acc": 0.6662457912457912,
48
- "acc_stderr": 0.009676065683575472,
49
- "acc_norm": 0.656986531986532,
50
- "acc_norm_stderr": 0.009740965666489234
51
- },
52
- "arc_challenge": {
53
- "acc": 0.3302047781569966,
54
- "acc_stderr": 0.013743085603760427,
55
- "acc_norm": 0.3464163822525597,
56
- "acc_norm_stderr": 0.013905011180063246
57
- },
58
- "sciq": {
59
- "acc": 0.922,
60
- "acc_stderr": 0.008484573530118585,
61
- "acc_norm": 0.922,
62
- "acc_norm_stderr": 0.008484573530118585
63
- },
64
- "piqa": {
65
- "acc": 0.7573449401523396,
66
- "acc_stderr": 0.010002002569708698,
67
- "acc_norm": 0.7633297062023939,
68
- "acc_norm_stderr": 0.009916841655042809
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }