RefalMachine commited on
Commit
f3eb7a5
·
verified ·
1 Parent(s): c67c979

Upload folder using huggingface_hub

Browse files
llmtf_eval_k5/daru_treewayabstractive.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de15995ebc4d84f552b2ef27f6a024c14c8231b53d4deec16bc2c0d1650e2d10
3
- size 13295390
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d93db6d5f2e0beb71e93b3c3e9f9efa49e96cd5f297e2724f773fdc9e8f5fa05
3
+ size 13299733
llmtf_eval_k5/daru_treewayabstractive_total.jsonl CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "task_name": "daru/treewayabstractive",
3
  "results": {
4
- "rouge1": 0.35742299153264667,
5
- "rouge2": 0.14485242187705508
6
  },
7
- "leaderboard_result": 0.25113770670485086
8
  }
 
1
  {
2
  "task_name": "daru/treewayabstractive",
3
  "results": {
4
+ "rouge1": 0.3516549334515103,
5
+ "rouge2": 0.1390946104887656
6
  },
7
+ "leaderboard_result": 0.24537477197013793
8
  }
llmtf_eval_k5/darumeru_MultiQ.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a30b4651bbea85fcfb218ce22c9b9d043be14b0465dfbe3b586a5bb2415da24d
3
- size 21502472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdde30c01033edbb6b8a4dd39e123b5d19e7b5432f085684a3f04c89d1869639
3
+ size 21502442
llmtf_eval_k5/darumeru_MultiQ_total.jsonl CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "task_name": "darumeru/MultiQ",
3
  "results": {
4
- "f1": 0.5675109413637138,
5
- "em": 0.4655831739961759
6
  },
7
- "leaderboard_result": 0.5165470576799449
8
  }
 
1
  {
2
  "task_name": "darumeru/MultiQ",
3
  "results": {
4
+ "f1": 0.5648542039178045,
5
+ "em": 0.4608030592734226
6
  },
7
+ "leaderboard_result": 0.5128286315956135
8
  }
llmtf_eval_k5/darumeru_USE.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d6f503952d2f77ca7c0565a8ee72198e10c5de06ed0692811f4c82bf5a28f3e
3
- size 10594252
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c16bd4cce409dfb12a2777c79864985ae9fccf8a693744e58be09168abb6f45c
3
+ size 10593793
llmtf_eval_k5/darumeru_USE_total.jsonl CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "task_name": "darumeru/USE",
3
  "results": {
4
- "grade_norm": 0.1019607843137255
5
  },
6
- "leaderboard_result": 0.1019607843137255
7
  }
 
1
  {
2
  "task_name": "darumeru/USE",
3
  "results": {
4
+ "grade_norm": 0.10588235294117647
5
  },
6
+ "leaderboard_result": 0.10588235294117647
7
  }
llmtf_eval_k5/darumeru_cp_para_en.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_cp_para_en_total.jsonl CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "task_name": "darumeru/cp_para_en",
3
  "results": {
4
- "symbol_per_token": 4.484972311760252,
5
- "len": 0.999859659310879,
6
- "lcs": 0.9881793213641535
7
  },
8
- "leaderboard_result": 0.9881793213641535
9
  }
 
1
  {
2
  "task_name": "darumeru/cp_para_en",
3
  "results": {
4
+ "symbol_per_token": 4.528028725817485,
5
+ "len": 0.9872908812117563,
6
+ "lcs": 0.9883058202112522
7
  },
8
+ "leaderboard_result": 0.9883058202112522
9
  }
llmtf_eval_k5/darumeru_cp_para_ru.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_cp_para_ru_total.jsonl CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "task_name": "darumeru/cp_para_ru",
3
  "results": {
4
- "symbol_per_token": 2.965935841307524,
5
- "len": 0.9998800850030358,
6
- "lcs": 0.9964476909825747
7
  },
8
- "leaderboard_result": 0.9964476909825747
9
  }
 
1
  {
2
  "task_name": "darumeru/cp_para_ru",
3
  "results": {
4
+ "symbol_per_token": 2.9865072713630245,
5
+ "len": 0.989199175688307,
6
+ "lcs": 0.9976086956521739
7
  },
8
+ "leaderboard_result": 0.9976086956521739
9
  }
llmtf_eval_k5/darumeru_cp_sent_en.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_cp_sent_en_total.jsonl CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "task_name": "darumeru/cp_sent_en",
3
  "results": {
4
- "symbol_per_token": 4.424142837938139,
5
- "len": 0.9984438516260162,
6
- "lcs": 0.9974371974918181
7
  },
8
- "leaderboard_result": 0.9984438516260162
9
  }
 
1
  {
2
  "task_name": "darumeru/cp_sent_en",
3
  "results": {
4
+ "symbol_per_token": 4.556837515131998,
5
+ "len": 0.9592170801454492,
6
+ "lcs": 0.9978536640150768
7
  },
8
+ "leaderboard_result": 0.9592170801454492
9
  }
llmtf_eval_k5/darumeru_cp_sent_ru.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_cp_sent_ru_total.jsonl CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "task_name": "darumeru/cp_sent_ru",
3
  "results": {
4
- "symbol_per_token": 2.8238989214765224,
5
- "len": 0.9998130402818972,
6
- "lcs": 0.9997733257303255
7
  },
8
- "leaderboard_result": 0.9998130402818972
9
  }
 
1
  {
2
  "task_name": "darumeru/cp_sent_ru",
3
  "results": {
4
+ "symbol_per_token": 2.886186230509937,
5
+ "len": 0.9638393987832617,
6
+ "lcs": 0.9997711394078869
7
  },
8
+ "leaderboard_result": 0.9638393987832617
9
  }
llmtf_eval_k5/evaluation_log.txt CHANGED
@@ -623,3 +623,276 @@ INFO: 2024-07-14 12:29:54,874: llmtf.base.evaluator: Ended eval
623
  INFO: 2024-07-14 12:29:54,887: llmtf.base.evaluator:
624
  mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
625
  0.621 0.251 0.407 0.517 0.770 0.412 0.490 0.102 0.988 0.996 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  INFO: 2024-07-14 12:29:54,887: llmtf.base.evaluator:
624
  mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
625
  0.621 0.251 0.407 0.517 0.770 0.412 0.490 0.102 0.988 0.996 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
626
+ INFO: 2024-07-14 14:12:57,969: llmtf.base.evaluator: Starting eval on ['darumeru/multiq', 'darumeru/parus', 'darumeru/rcb', 'darumeru/ruopenbookqa', 'darumeru/rutie', 'darumeru/ruworldtree', 'darumeru/rwsd', 'darumeru/use', 'russiannlp/rucola_custom']
627
+ INFO: 2024-07-14 14:12:57,971: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
628
+ INFO: 2024-07-14 14:12:57,971: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
629
+ INFO: 2024-07-14 14:12:58,529: llmtf.base.evaluator: Starting eval on ['darumeru/rummlu']
630
+ INFO: 2024-07-14 14:12:58,529: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
631
+ INFO: 2024-07-14 14:12:58,529: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
632
+ INFO: 2024-07-14 14:12:59,265: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/rummlu']
633
+ INFO: 2024-07-14 14:12:59,266: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
634
+ INFO: 2024-07-14 14:12:59,266: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
635
+ INFO: 2024-07-14 14:13:01,346: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/enmmlu']
636
+ INFO: 2024-07-14 14:13:01,347: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
637
+ INFO: 2024-07-14 14:13:01,347: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
638
+ INFO: 2024-07-14 14:13:04,501: llmtf.base.evaluator: Starting eval on ['daru/treewayabstractive']
639
+ INFO: 2024-07-14 14:13:04,501: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
640
+ INFO: 2024-07-14 14:13:04,501: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
641
+ INFO: 2024-07-14 14:13:05,549: llmtf.base.evaluator: Starting eval on ['daru/treewayextractive']
642
+ INFO: 2024-07-14 14:13:05,549: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
643
+ INFO: 2024-07-14 14:13:05,549: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
644
+ INFO: 2024-07-14 14:13:07,592: llmtf.base.evaluator: Starting eval on ['darumeru/cp_sent_ru', 'darumeru/cp_sent_en', 'darumeru/cp_para_ru', 'darumeru/cp_para_en']
645
+ INFO: 2024-07-14 14:13:07,592: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
646
+ INFO: 2024-07-14 14:13:07,592: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
647
+ INFO: 2024-07-14 14:13:12,035: llmtf.base.darumeru/cp_sent_ru: Loading Dataset: 4.44s
648
+ INFO: 2024-07-14 14:13:17,777: llmtf.base.daru/treewayextractive: Loading Dataset: 12.23s
649
+ INFO: 2024-07-14 14:13:18,790: llmtf.base.darumeru/MultiQ: Loading Dataset: 20.82s
650
+ INFO: 2024-07-14 14:13:21,041: llmtf.base.daru/treewayabstractive: Loading Dataset: 16.54s
651
+ INFO: 2024-07-14 14:14:23,369: llmtf.base.darumeru/ruMMLU: Loading Dataset: 84.84s
652
+ INFO: 2024-07-14 14:16:24,369: llmtf.base.nlpcoreteam/enMMLU: Loading Dataset: 203.02s
653
+ INFO: 2024-07-14 14:17:09,995: llmtf.base.nlpcoreteam/ruMMLU: Loading Dataset: 250.73s
654
+ INFO: 2024-07-14 14:19:20,099: llmtf.base.darumeru/MultiQ: Processing Dataset: 361.28s
655
+ INFO: 2024-07-14 14:19:20,100: llmtf.base.darumeru/MultiQ: Results for darumeru/MultiQ:
656
+ INFO: 2024-07-14 14:19:20,106: llmtf.base.darumeru/MultiQ: {'f1': 0.5648542039178045, 'em': 0.4608030592734226}
657
+ INFO: 2024-07-14 14:19:20,117: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
658
+ INFO: 2024-07-14 14:19:20,117: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
659
+ INFO: 2024-07-14 14:19:23,169: llmtf.base.darumeru/PARus: Loading Dataset: 3.05s
660
+ INFO: 2024-07-14 14:19:35,315: llmtf.base.darumeru/PARus: Processing Dataset: 12.09s
661
+ INFO: 2024-07-14 14:19:35,318: llmtf.base.darumeru/PARus: Results for darumeru/PARus:
662
+ INFO: 2024-07-14 14:19:35,332: llmtf.base.darumeru/PARus: {'acc': 0.77}
663
+ INFO: 2024-07-14 14:19:35,333: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
664
+ INFO: 2024-07-14 14:19:35,334: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
665
+ INFO: 2024-07-14 14:19:39,019: llmtf.base.darumeru/cp_sent_ru: Processing Dataset: 386.98s
666
+ INFO: 2024-07-14 14:19:39,021: llmtf.base.darumeru/cp_sent_ru: Results for darumeru/cp_sent_ru:
667
+ INFO: 2024-07-14 14:19:39,026: llmtf.base.darumeru/cp_sent_ru: {'symbol_per_token': 2.886186230509937, 'len': 0.9638393987832617, 'lcs': 0.9997711394078869}
668
+ INFO: 2024-07-14 14:19:39,030: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
669
+ INFO: 2024-07-14 14:19:39,030: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
670
+ INFO: 2024-07-14 14:19:39,650: llmtf.base.darumeru/RCB: Loading Dataset: 4.32s
671
+ INFO: 2024-07-14 14:19:43,492: llmtf.base.darumeru/cp_sent_en: Loading Dataset: 4.46s
672
+ INFO: 2024-07-14 14:19:51,430: llmtf.base.daru/treewayextractive: Processing Dataset: 393.65s
673
+ INFO: 2024-07-14 14:19:51,431: llmtf.base.daru/treewayextractive: Results for daru/treewayextractive:
674
+ INFO: 2024-07-14 14:19:51,654: llmtf.base.daru/treewayextractive: {'r-prec': 0.4072662337662338}
675
+ INFO: 2024-07-14 14:19:51,698: llmtf.base.evaluator: Ended eval
676
+ INFO: 2024-07-14 14:19:51,742: llmtf.base.evaluator:
677
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
678
+ 0.619 0.251 0.407 0.513 0.770 0.412 0.490 0.102 0.988 0.996 0.998 0.964 0.500 0.707 0.421 0.836 0.680 0.566 0.542
679
+ INFO: 2024-07-14 14:20:00,600: llmtf.base.darumeru/RCB: Processing Dataset: 20.95s
680
+ INFO: 2024-07-14 14:20:00,602: llmtf.base.darumeru/RCB: Results for darumeru/RCB:
681
+ INFO: 2024-07-14 14:20:00,610: llmtf.base.darumeru/RCB: {'acc': 0.41363636363636364, 'f1_macro': 0.4105113251051133}
682
+ INFO: 2024-07-14 14:20:00,612: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
683
+ INFO: 2024-07-14 14:20:00,612: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
684
+ INFO: 2024-07-14 14:20:15,143: llmtf.base.darumeru/ruOpenBookQA: Loading Dataset: 14.53s
685
+ INFO: 2024-07-14 14:22:18,985: llmtf.base.darumeru/ruOpenBookQA: Processing Dataset: 123.84s
686
+ INFO: 2024-07-14 14:22:18,996: llmtf.base.darumeru/ruOpenBookQA: Results for darumeru/ruOpenBookQA:
687
+ INFO: 2024-07-14 14:22:19,089: llmtf.base.darumeru/ruOpenBookQA: {'acc': 0.7074742268041238, 'f1_macro': 0.7072263442662465}
688
+ INFO: 2024-07-14 14:22:19,105: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
689
+ INFO: 2024-07-14 14:22:19,106: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
690
+ INFO: 2024-07-14 14:22:26,540: llmtf.base.darumeru/ruTiE: Loading Dataset: 7.43s
691
+ INFO: 2024-07-14 14:26:04,768: llmtf.base.darumeru/cp_sent_en: Processing Dataset: 381.27s
692
+ INFO: 2024-07-14 14:26:04,771: llmtf.base.darumeru/cp_sent_en: Results for darumeru/cp_sent_en:
693
+ INFO: 2024-07-14 14:26:04,776: llmtf.base.darumeru/cp_sent_en: {'symbol_per_token': 4.556837515131998, 'len': 0.9592170801454492, 'lcs': 0.9978536640150768}
694
+ INFO: 2024-07-14 14:26:04,779: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
695
+ INFO: 2024-07-14 14:26:04,779: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
696
+ INFO: 2024-07-14 14:26:08,974: llmtf.base.darumeru/cp_para_ru: Loading Dataset: 4.19s
697
+ INFO: 2024-07-14 14:26:54,269: llmtf.base.darumeru/ruTiE: Processing Dataset: 267.71s
698
+ INFO: 2024-07-14 14:26:54,270: llmtf.base.darumeru/ruTiE: Results for darumeru/ruTiE:
699
+ INFO: 2024-07-14 14:26:54,301: llmtf.base.darumeru/ruTiE: {'acc': 0.42093023255813955}
700
+ INFO: 2024-07-14 14:26:54,305: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
701
+ INFO: 2024-07-14 14:26:54,305: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
702
+ INFO: 2024-07-14 14:26:56,946: llmtf.base.darumeru/ruWorldTree: Loading Dataset: 2.64s
703
+ INFO: 2024-07-14 14:27:04,165: llmtf.base.darumeru/ruWorldTree: Processing Dataset: 7.22s
704
+ INFO: 2024-07-14 14:27:04,166: llmtf.base.darumeru/ruWorldTree: Results for darumeru/ruWorldTree:
705
+ INFO: 2024-07-14 14:27:04,173: llmtf.base.darumeru/ruWorldTree: {'acc': 0.8380952380952381, 'f1_macro': 0.8343115676204449}
706
+ INFO: 2024-07-14 14:27:04,174: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
707
+ INFO: 2024-07-14 14:27:04,174: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
708
+ INFO: 2024-07-14 14:27:07,701: llmtf.base.darumeru/RWSD: Loading Dataset: 3.53s
709
+ INFO: 2024-07-14 14:27:25,947: llmtf.base.darumeru/RWSD: Processing Dataset: 18.24s
710
+ INFO: 2024-07-14 14:27:25,955: llmtf.base.darumeru/RWSD: Results for darumeru/RWSD:
711
+ INFO: 2024-07-14 14:27:26,055: llmtf.base.darumeru/RWSD: {'acc': 0.49019607843137253}
712
+ INFO: 2024-07-14 14:27:26,057: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
713
+ INFO: 2024-07-14 14:27:26,057: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
714
+ INFO: 2024-07-14 14:27:40,312: llmtf.base.darumeru/USE: Loading Dataset: 14.25s
715
+ INFO: 2024-07-14 14:31:38,677: llmtf.base.darumeru/USE: Processing Dataset: 238.36s
716
+ INFO: 2024-07-14 14:31:38,694: llmtf.base.darumeru/USE: Results for darumeru/USE:
717
+ INFO: 2024-07-14 14:31:38,700: llmtf.base.darumeru/USE: {'grade_norm': 0.10588235294117647}
718
+ INFO: 2024-07-14 14:31:38,707: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
719
+ INFO: 2024-07-14 14:31:38,707: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
720
+ INFO: 2024-07-14 14:31:58,236: llmtf.base.russiannlp/rucola_custom: Loading Dataset: 19.53s
721
+ INFO: 2024-07-14 14:32:44,406: llmtf.base.nlpcoreteam/enMMLU: Processing Dataset: 980.03s
722
+ INFO: 2024-07-14 14:32:44,409: llmtf.base.nlpcoreteam/enMMLU: Results for nlpcoreteam/enMMLU:
723
+ INFO: 2024-07-14 14:32:44,455: llmtf.base.nlpcoreteam/enMMLU: metric
724
+ subject
725
+ abstract_algebra 0.310000
726
+ anatomy 0.696296
727
+ astronomy 0.697368
728
+ business_ethics 0.650000
729
+ clinical_knowledge 0.754717
730
+ college_biology 0.770833
731
+ college_chemistry 0.470000
732
+ college_computer_science 0.470000
733
+ college_mathematics 0.340000
734
+ college_medicine 0.647399
735
+ college_physics 0.500000
736
+ computer_security 0.800000
737
+ conceptual_physics 0.595745
738
+ econometrics 0.526316
739
+ electrical_engineering 0.655172
740
+ elementary_mathematics 0.441799
741
+ formal_logic 0.492063
742
+ global_facts 0.330000
743
+ high_school_biology 0.777419
744
+ high_school_chemistry 0.551724
745
+ high_school_computer_science 0.680000
746
+ high_school_european_history 0.769697
747
+ high_school_geography 0.808081
748
+ high_school_government_and_politics 0.891192
749
+ high_school_macroeconomics 0.653846
750
+ high_school_mathematics 0.392593
751
+ high_school_microeconomics 0.731092
752
+ high_school_physics 0.450331
753
+ high_school_psychology 0.849541
754
+ high_school_statistics 0.541667
755
+ high_school_us_history 0.857843
756
+ high_school_world_history 0.827004
757
+ human_aging 0.713004
758
+ human_sexuality 0.770992
759
+ international_law 0.851240
760
+ jurisprudence 0.759259
761
+ logical_fallacies 0.736196
762
+ machine_learning 0.517857
763
+ management 0.883495
764
+ marketing 0.888889
765
+ medical_genetics 0.790000
766
+ miscellaneous 0.831418
767
+ moral_disputes 0.719653
768
+ moral_scenarios 0.412291
769
+ nutrition 0.767974
770
+ philosophy 0.749196
771
+ prehistory 0.734568
772
+ professional_accounting 0.482270
773
+ professional_law 0.468709
774
+ professional_medicine 0.716912
775
+ professional_psychology 0.722222
776
+ public_relations 0.718182
777
+ security_studies 0.759184
778
+ sociology 0.865672
779
+ us_foreign_policy 0.870000
780
+ virology 0.572289
781
+ world_religions 0.818713
782
+ INFO: 2024-07-14 14:32:44,463: llmtf.base.nlpcoreteam/enMMLU: metric
783
+ subject
784
+ STEM 0.553473
785
+ humanities 0.707418
786
+ other (business, health, misc.) 0.694619
787
+ social sciences 0.763860
788
+ INFO: 2024-07-14 14:32:44,473: llmtf.base.nlpcoreteam/enMMLU: {'acc': 0.6798423546157744}
789
+ INFO: 2024-07-14 14:32:44,541: llmtf.base.evaluator: Ended eval
790
+ INFO: 2024-07-14 14:32:44,569: llmtf.base.evaluator:
791
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
792
+ 0.617 0.251 0.407 0.513 0.770 0.412 0.490 0.106 0.988 0.996 0.959 0.964 0.500 0.707 0.421 0.836 0.680 0.566 0.542
793
+ INFO: 2024-07-14 14:34:31,976: llmtf.base.darumeru/ruMMLU: Processing Dataset: 1208.60s
794
+ INFO: 2024-07-14 14:34:31,979: llmtf.base.darumeru/ruMMLU: Results for darumeru/ruMMLU:
795
+ INFO: 2024-07-14 14:34:32,006: llmtf.base.darumeru/ruMMLU: {'acc': 0.5003491968472513}
796
+ INFO: 2024-07-14 14:34:32,084: llmtf.base.evaluator: Ended eval
797
+ INFO: 2024-07-14 14:34:32,098: llmtf.base.evaluator:
798
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
799
+ 0.617 0.251 0.407 0.513 0.770 0.412 0.490 0.106 0.988 0.996 0.959 0.964 0.500 0.707 0.421 0.836 0.680 0.566 0.542
800
+ INFO: 2024-07-14 14:34:43,299: llmtf.base.russiannlp/rucola_custom: Processing Dataset: 165.06s
801
+ INFO: 2024-07-14 14:34:43,302: llmtf.base.russiannlp/rucola_custom: Results for russiannlp/rucola_custom:
802
+ INFO: 2024-07-14 14:34:43,316: llmtf.base.russiannlp/rucola_custom: {'acc': 0.7240760674560459, 'mcc': 0.36043904403572885}
803
+ INFO: 2024-07-14 14:34:43,327: llmtf.base.evaluator: Ended eval
804
+ INFO: 2024-07-14 14:34:43,336: llmtf.base.evaluator:
805
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
806
+ 0.617 0.251 0.407 0.513 0.770 0.412 0.490 0.106 0.988 0.996 0.959 0.964 0.500 0.707 0.421 0.836 0.680 0.566 0.542
807
+ INFO: 2024-07-14 14:41:14,321: llmtf.base.nlpcoreteam/ruMMLU: Processing Dataset: 1444.32s
808
+ INFO: 2024-07-14 14:41:14,324: llmtf.base.nlpcoreteam/ruMMLU: Results for nlpcoreteam/ruMMLU:
809
+ INFO: 2024-07-14 14:41:14,373: llmtf.base.nlpcoreteam/ruMMLU: metric
810
+ subject
811
+ abstract_algebra 0.350000
812
+ anatomy 0.444444
813
+ astronomy 0.638158
814
+ business_ethics 0.630000
815
+ clinical_knowledge 0.581132
816
+ college_biology 0.569444
817
+ college_chemistry 0.410000
818
+ college_computer_science 0.430000
819
+ college_mathematics 0.340000
820
+ college_medicine 0.549133
821
+ college_physics 0.323529
822
+ computer_security 0.700000
823
+ conceptual_physics 0.527660
824
+ econometrics 0.438596
825
+ electrical_engineering 0.537931
826
+ elementary_mathematics 0.394180
827
+ formal_logic 0.420635
828
+ global_facts 0.330000
829
+ high_school_biology 0.658065
830
+ high_school_chemistry 0.433498
831
+ high_school_computer_science 0.660000
832
+ high_school_european_history 0.727273
833
+ high_school_geography 0.691919
834
+ high_school_government_and_politics 0.683938
835
+ high_school_macroeconomics 0.548718
836
+ high_school_mathematics 0.400000
837
+ high_school_microeconomics 0.525210
838
+ high_school_physics 0.357616
839
+ high_school_psychology 0.662385
840
+ high_school_statistics 0.504630
841
+ high_school_us_history 0.705882
842
+ high_school_world_history 0.742616
843
+ human_aging 0.560538
844
+ human_sexuality 0.625954
845
+ international_law 0.743802
846
+ jurisprudence 0.666667
847
+ logical_fallacies 0.558282
848
+ machine_learning 0.526786
849
+ management 0.757282
850
+ marketing 0.709402
851
+ medical_genetics 0.620000
852
+ miscellaneous 0.629630
853
+ moral_disputes 0.598266
854
+ moral_scenarios 0.392179
855
+ nutrition 0.643791
856
+ philosophy 0.617363
857
+ prehistory 0.583333
858
+ professional_accounting 0.375887
859
+ professional_law 0.384615
860
+ professional_medicine 0.503676
861
+ professional_psychology 0.503268
862
+ public_relations 0.572727
863
+ security_studies 0.669388
864
+ sociology 0.696517
865
+ us_foreign_policy 0.780000
866
+ virology 0.500000
867
+ world_religions 0.672515
868
+ INFO: 2024-07-14 14:41:14,381: llmtf.base.nlpcoreteam/ruMMLU: metric
869
+ subject
870
+ STEM 0.486750
871
+ humanities 0.601033
872
+ other (business, health, misc.) 0.559637
873
+ social sciences 0.616552
874
+ INFO: 2024-07-14 14:41:14,391: llmtf.base.nlpcoreteam/ruMMLU: {'acc': 0.5659927988894412}
875
+ INFO: 2024-07-14 14:41:14,470: llmtf.base.evaluator: Ended eval
876
+ INFO: 2024-07-14 14:41:14,485: llmtf.base.evaluator:
877
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
878
+ 0.617 0.251 0.407 0.513 0.770 0.412 0.490 0.106 0.988 0.996 0.959 0.964 0.500 0.707 0.421 0.836 0.680 0.566 0.542
879
+ INFO: 2024-07-14 14:48:17,879: llmtf.base.darumeru/cp_para_ru: Processing Dataset: 1328.90s
880
+ INFO: 2024-07-14 14:48:17,896: llmtf.base.darumeru/cp_para_ru: Results for darumeru/cp_para_ru:
881
+ INFO: 2024-07-14 14:48:17,934: llmtf.base.darumeru/cp_para_ru: {'symbol_per_token': 2.9865072713630245, 'len': 0.989199175688307, 'lcs': 0.9976086956521739}
882
+ INFO: 2024-07-14 14:48:17,936: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
883
+ INFO: 2024-07-14 14:48:17,936: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
884
+ INFO: 2024-07-14 14:48:22,484: llmtf.base.darumeru/cp_para_en: Loading Dataset: 4.55s
885
+ INFO: 2024-07-14 14:49:28,678: llmtf.base.daru/treewayabstractive: Processing Dataset: 2167.63s
886
+ INFO: 2024-07-14 14:49:28,679: llmtf.base.daru/treewayabstractive: Results for daru/treewayabstractive:
887
+ INFO: 2024-07-14 14:49:28,685: llmtf.base.daru/treewayabstractive: {'rouge1': 0.3516549334515103, 'rouge2': 0.1390946104887656}
888
+ INFO: 2024-07-14 14:49:28,690: llmtf.base.evaluator: Ended eval
889
+ INFO: 2024-07-14 14:49:28,699: llmtf.base.evaluator:
890
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
891
+ 0.617 0.245 0.407 0.513 0.770 0.412 0.490 0.106 0.988 0.998 0.959 0.964 0.500 0.707 0.421 0.836 0.680 0.566 0.542
892
+ INFO: 2024-07-14 15:10:39,600: llmtf.base.darumeru/cp_para_en: Processing Dataset: 1337.10s
893
+ INFO: 2024-07-14 15:10:39,604: llmtf.base.darumeru/cp_para_en: Results for darumeru/cp_para_en:
894
+ INFO: 2024-07-14 15:10:39,639: llmtf.base.darumeru/cp_para_en: {'symbol_per_token': 4.528028725817485, 'len': 0.9872908812117563, 'lcs': 0.9883058202112522}
895
+ INFO: 2024-07-14 15:10:39,641: llmtf.base.evaluator: Ended eval
896
+ INFO: 2024-07-14 15:10:39,662: llmtf.base.evaluator:
897
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
898
+ 0.617 0.245 0.407 0.513 0.770 0.412 0.490 0.106 0.988 0.998 0.959 0.964 0.500 0.707 0.421 0.836 0.680 0.566 0.542
llmtf_eval_k5/evaluation_results.txt CHANGED
@@ -1,2 +1,2 @@
1
  mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
2
- 0.621 0.251 0.407 0.517 0.770 0.412 0.490 0.102 0.988 0.996 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
 
1
  mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
2
+ 0.617 0.245 0.407 0.513 0.770 0.412 0.490 0.106 0.988 0.998 0.959 0.964 0.500 0.707 0.421 0.836 0.680 0.566 0.542