RefalMachine commited on
Commit
c67c979
·
verified ·
1 Parent(s): dfd2e61

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +9 -0
  2. llmtf_eval_k5/daru_treewayabstractive.jsonl +3 -0
  3. llmtf_eval_k5/daru_treewayabstractive_params.jsonl +63 -0
  4. llmtf_eval_k5/daru_treewayabstractive_total.jsonl +8 -0
  5. llmtf_eval_k5/daru_treewayextractive.jsonl +3 -0
  6. llmtf_eval_k5/daru_treewayextractive_params.jsonl +63 -0
  7. llmtf_eval_k5/daru_treewayextractive_total.jsonl +7 -0
  8. llmtf_eval_k5/darumeru_MultiQ.jsonl +3 -0
  9. llmtf_eval_k5/darumeru_MultiQ_params.jsonl +63 -0
  10. llmtf_eval_k5/darumeru_MultiQ_total.jsonl +8 -0
  11. llmtf_eval_k5/darumeru_PARus.jsonl +0 -0
  12. llmtf_eval_k5/darumeru_PARus_params.jsonl +63 -0
  13. llmtf_eval_k5/darumeru_PARus_total.jsonl +7 -0
  14. llmtf_eval_k5/darumeru_RCB.jsonl +0 -0
  15. llmtf_eval_k5/darumeru_RCB_params.jsonl +63 -0
  16. llmtf_eval_k5/darumeru_RCB_total.jsonl +8 -0
  17. llmtf_eval_k5/darumeru_RWSD.jsonl +0 -0
  18. llmtf_eval_k5/darumeru_RWSD_params.jsonl +63 -0
  19. llmtf_eval_k5/darumeru_RWSD_total.jsonl +7 -0
  20. llmtf_eval_k5/darumeru_USE.jsonl +3 -0
  21. llmtf_eval_k5/darumeru_USE_params.jsonl +63 -0
  22. llmtf_eval_k5/darumeru_USE_total.jsonl +7 -0
  23. llmtf_eval_k5/darumeru_cp_para_en.jsonl +0 -0
  24. llmtf_eval_k5/darumeru_cp_para_en_params.jsonl +63 -0
  25. llmtf_eval_k5/darumeru_cp_para_en_total.jsonl +9 -0
  26. llmtf_eval_k5/darumeru_cp_para_ru.jsonl +0 -0
  27. llmtf_eval_k5/darumeru_cp_para_ru_params.jsonl +63 -0
  28. llmtf_eval_k5/darumeru_cp_para_ru_total.jsonl +9 -0
  29. llmtf_eval_k5/darumeru_cp_sent_en.jsonl +0 -0
  30. llmtf_eval_k5/darumeru_cp_sent_en_params.jsonl +63 -0
  31. llmtf_eval_k5/darumeru_cp_sent_en_total.jsonl +9 -0
  32. llmtf_eval_k5/darumeru_cp_sent_ru.jsonl +0 -0
  33. llmtf_eval_k5/darumeru_cp_sent_ru_params.jsonl +63 -0
  34. llmtf_eval_k5/darumeru_cp_sent_ru_total.jsonl +9 -0
  35. llmtf_eval_k5/darumeru_ruMMLU.jsonl +3 -0
  36. llmtf_eval_k5/darumeru_ruMMLU_params.jsonl +63 -0
  37. llmtf_eval_k5/darumeru_ruMMLU_total.jsonl +7 -0
  38. llmtf_eval_k5/darumeru_ruOpenBookQA.jsonl +0 -0
  39. llmtf_eval_k5/darumeru_ruOpenBookQA_params.jsonl +63 -0
  40. llmtf_eval_k5/darumeru_ruOpenBookQA_total.jsonl +8 -0
  41. llmtf_eval_k5/darumeru_ruTiE.jsonl +3 -0
  42. llmtf_eval_k5/darumeru_ruTiE_params.jsonl +63 -0
  43. llmtf_eval_k5/darumeru_ruTiE_total.jsonl +7 -0
  44. llmtf_eval_k5/darumeru_ruWorldTree.jsonl +0 -0
  45. llmtf_eval_k5/darumeru_ruWorldTree_params.jsonl +63 -0
  46. llmtf_eval_k5/darumeru_ruWorldTree_total.jsonl +8 -0
  47. llmtf_eval_k5/evaluation_log.txt +625 -0
  48. llmtf_eval_k5/evaluation_results.txt +2 -0
  49. llmtf_eval_k5/nlpcoreteam_enMMLU.jsonl +3 -0
  50. llmtf_eval_k5/nlpcoreteam_enMMLU_params.jsonl +63 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ llmtf_eval_k5/daru_treewayabstractive.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ llmtf_eval_k5/daru_treewayextractive.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ llmtf_eval_k5/darumeru_MultiQ.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ llmtf_eval_k5/darumeru_USE.jsonl filter=lfs diff=lfs merge=lfs -text
40
+ llmtf_eval_k5/darumeru_ruMMLU.jsonl filter=lfs diff=lfs merge=lfs -text
41
+ llmtf_eval_k5/darumeru_ruTiE.jsonl filter=lfs diff=lfs merge=lfs -text
42
+ llmtf_eval_k5/nlpcoreteam_enMMLU.jsonl filter=lfs diff=lfs merge=lfs -text
43
+ llmtf_eval_k5/nlpcoreteam_ruMMLU.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ llmtf_eval_k5/russiannlp_rucola_custom.jsonl filter=lfs diff=lfs merge=lfs -text
llmtf_eval_k5/daru_treewayabstractive.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de15995ebc4d84f552b2ef27f6a024c14c8231b53d4deec16bc2c0d1650e2d10
3
+ size 13295390
llmtf_eval_k5/daru_treewayabstractive_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 512,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 500,
61
+ "method": "generate"
62
+ }
63
+ }
llmtf_eval_k5/daru_treewayabstractive_total.jsonl ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "daru/treewayabstractive",
3
+ "results": {
4
+ "rouge1": 0.35742299153264667,
5
+ "rouge2": 0.14485242187705508
6
+ },
7
+ "leaderboard_result": 0.25113770670485086
8
+ }
llmtf_eval_k5/daru_treewayextractive.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:543cc3f8a6c50fe3254078ca11bcb684a4bcaa2dee309ae550e39cab39637dc7
3
+ size 259250463
llmtf_eval_k5/daru_treewayextractive_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 1,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 500,
61
+ "method": "calculate_logsoftmax"
62
+ }
63
+ }
llmtf_eval_k5/daru_treewayextractive_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "daru/treewayextractive",
3
+ "results": {
4
+ "r-prec": 0.4072662337662338
5
+ },
6
+ "leaderboard_result": 0.4072662337662338
7
+ }
llmtf_eval_k5/darumeru_MultiQ.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a30b4651bbea85fcfb218ce22c9b9d043be14b0465dfbe3b586a5bb2415da24d
3
+ size 21502472
llmtf_eval_k5/darumeru_MultiQ_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "generate"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_MultiQ_total.jsonl ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/MultiQ",
3
+ "results": {
4
+ "f1": 0.5675109413637138,
5
+ "em": 0.4655831739961759
6
+ },
7
+ "leaderboard_result": 0.5165470576799449
8
+ }
llmtf_eval_k5/darumeru_PARus.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_PARus_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "calculate_tokens_proba"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_PARus_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/PARus",
3
+ "results": {
4
+ "acc": 0.77
5
+ },
6
+ "leaderboard_result": 0.77
7
+ }
llmtf_eval_k5/darumeru_RCB.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_RCB_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "calculate_tokens_proba"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_RCB_total.jsonl ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/RCB",
3
+ "results": {
4
+ "acc": 0.41363636363636364,
5
+ "f1_macro": 0.4105113251051133
6
+ },
7
+ "leaderboard_result": 0.41207384437073846
8
+ }
llmtf_eval_k5/darumeru_RWSD.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_RWSD_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "calculate_tokens_proba"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_RWSD_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/RWSD",
3
+ "results": {
4
+ "acc": 0.49019607843137253
5
+ },
6
+ "leaderboard_result": 0.49019607843137253
7
+ }
llmtf_eval_k5/darumeru_USE.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d6f503952d2f77ca7c0565a8ee72198e10c5de06ed0692811f4c82bf5a28f3e
3
+ size 10594252
llmtf_eval_k5/darumeru_USE_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "generate"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_USE_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/USE",
3
+ "results": {
4
+ "grade_norm": 0.1019607843137255
5
+ },
6
+ "leaderboard_result": 0.1019607843137255
7
+ }
llmtf_eval_k5/darumeru_cp_para_en.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_cp_para_en_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 1024,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "generate"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_cp_para_en_total.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/cp_para_en",
3
+ "results": {
4
+ "symbol_per_token": 4.484972311760252,
5
+ "len": 0.999859659310879,
6
+ "lcs": 0.9881793213641535
7
+ },
8
+ "leaderboard_result": 0.9881793213641535
9
+ }
llmtf_eval_k5/darumeru_cp_para_ru.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_cp_para_ru_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 1024,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "generate"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_cp_para_ru_total.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/cp_para_ru",
3
+ "results": {
4
+ "symbol_per_token": 2.965935841307524,
5
+ "len": 0.9998800850030358,
6
+ "lcs": 0.9964476909825747
7
+ },
8
+ "leaderboard_result": 0.9964476909825747
9
+ }
llmtf_eval_k5/darumeru_cp_sent_en.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_cp_sent_en_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 128,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "generate"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_cp_sent_en_total.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/cp_sent_en",
3
+ "results": {
4
+ "symbol_per_token": 4.424142837938139,
5
+ "len": 0.9984438516260162,
6
+ "lcs": 0.9974371974918181
7
+ },
8
+ "leaderboard_result": 0.9984438516260162
9
+ }
llmtf_eval_k5/darumeru_cp_sent_ru.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_cp_sent_ru_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 128,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "generate"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_cp_sent_ru_total.jsonl ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/cp_sent_ru",
3
+ "results": {
4
+ "symbol_per_token": 2.8238989214765224,
5
+ "len": 0.9998130402818972,
6
+ "lcs": 0.9997733257303255
7
+ },
8
+ "leaderboard_result": 0.9998130402818972
9
+ }
llmtf_eval_k5/darumeru_ruMMLU.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bb99db9d58a8d033446b02e206aafc18a26b524d4232450b9efca24d769ff65
3
+ size 89350713
llmtf_eval_k5/darumeru_ruMMLU_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "calculate_tokens_proba"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_ruMMLU_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/ruMMLU",
3
+ "results": {
4
+ "acc": 0.5003491968472513
5
+ },
6
+ "leaderboard_result": 0.5003491968472513
7
+ }
llmtf_eval_k5/darumeru_ruOpenBookQA.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_ruOpenBookQA_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "calculate_tokens_proba"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_ruOpenBookQA_total.jsonl ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/ruOpenBookQA",
3
+ "results": {
4
+ "acc": 0.7074742268041238,
5
+ "f1_macro": 0.7072263442662465
6
+ },
7
+ "leaderboard_result": 0.7073502855351852
8
+ }
llmtf_eval_k5/darumeru_ruTiE.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0208c8686919aead111cccd9b7680ced6f9ce350d0cc376f4f44d446fda5c21f
3
+ size 12766465
llmtf_eval_k5/darumeru_ruTiE_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "calculate_tokens_proba"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_ruTiE_total.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/ruTiE",
3
+ "results": {
4
+ "acc": 0.42093023255813955
5
+ },
6
+ "leaderboard_result": 0.42093023255813955
7
+ }
llmtf_eval_k5/darumeru_ruWorldTree.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llmtf_eval_k5/darumeru_ruWorldTree_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "calculate_tokens_proba"
62
+ }
63
+ }
llmtf_eval_k5/darumeru_ruWorldTree_total.jsonl ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_name": "darumeru/ruWorldTree",
3
+ "results": {
4
+ "acc": 0.8380952380952381,
5
+ "f1_macro": 0.8343115676204449
6
+ },
7
+ "leaderboard_result": 0.8362034028578416
8
+ }
llmtf_eval_k5/evaluation_log.txt ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO: 2024-07-14 10:23:20,130: llmtf.base.evaluator: Starting eval on ['darumeru/multiq', 'darumeru/parus', 'darumeru/rcb', 'darumeru/ruopenbookqa', 'darumeru/rutie', 'darumeru/ruworldtree', 'darumeru/rwsd', 'darumeru/use', 'russiannlp/rucola_custom']
2
+ INFO: 2024-07-14 10:23:20,131: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
3
+ INFO: 2024-07-14 10:23:20,132: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
4
+ INFO: 2024-07-14 10:23:21,238: llmtf.base.evaluator: Starting eval on ['darumeru/rummlu']
5
+ INFO: 2024-07-14 10:23:21,239: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
6
+ INFO: 2024-07-14 10:23:21,239: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
7
+ INFO: 2024-07-14 10:23:24,579: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/rummlu']
8
+ INFO: 2024-07-14 10:23:24,580: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
9
+ INFO: 2024-07-14 10:23:24,580: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
10
+ INFO: 2024-07-14 10:23:25,247: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/enmmlu']
11
+ INFO: 2024-07-14 10:23:25,259: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
12
+ INFO: 2024-07-14 10:23:25,259: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
13
+ INFO: 2024-07-14 10:23:27,051: llmtf.base.evaluator: Starting eval on ['daru/treewayabstractive']
14
+ INFO: 2024-07-14 10:23:27,051: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
15
+ INFO: 2024-07-14 10:23:27,051: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
16
+ INFO: 2024-07-14 10:23:28,704: llmtf.base.evaluator: Starting eval on ['daru/treewayextractive']
17
+ INFO: 2024-07-14 10:23:28,704: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
18
+ INFO: 2024-07-14 10:23:28,704: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
19
+ INFO: 2024-07-14 10:23:30,887: llmtf.base.evaluator: Starting eval on ['darumeru/cp_sent_ru', 'darumeru/cp_sent_en', 'darumeru/cp_para_ru', 'darumeru/cp_para_en']
20
+ INFO: 2024-07-14 10:23:30,887: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
21
+ INFO: 2024-07-14 10:23:30,887: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
22
+ INFO: 2024-07-14 10:23:34,905: llmtf.base.darumeru/cp_sent_ru: Loading Dataset: 4.02s
23
+ INFO: 2024-07-14 10:23:40,350: llmtf.base.daru/treewayextractive: Loading Dataset: 11.65s
24
+ INFO: 2024-07-14 10:23:41,282: llmtf.base.darumeru/MultiQ: Loading Dataset: 21.15s
25
+ INFO: 2024-07-14 10:23:44,620: llmtf.base.daru/treewayabstractive: Loading Dataset: 17.57s
26
+ INFO: 2024-07-14 10:24:47,423: llmtf.base.darumeru/ruMMLU: Loading Dataset: 86.18s
27
+ INFO: 2024-07-14 10:26:47,793: llmtf.base.nlpcoreteam/enMMLU: Loading Dataset: 202.53s
28
+ INFO: 2024-07-14 10:27:33,250: llmtf.base.nlpcoreteam/ruMMLU: Loading Dataset: 248.67s
29
+ INFO: 2024-07-14 10:29:44,105: llmtf.base.darumeru/MultiQ: Processing Dataset: 362.82s
30
+ INFO: 2024-07-14 10:29:44,107: llmtf.base.darumeru/MultiQ: Results for darumeru/MultiQ:
31
+ INFO: 2024-07-14 10:29:44,126: llmtf.base.darumeru/MultiQ: {'f1': 0.569265566933374, 'em': 0.4655831739961759}
32
+ INFO: 2024-07-14 10:29:44,137: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
33
+ INFO: 2024-07-14 10:29:44,137: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
34
+ INFO: 2024-07-14 10:29:47,161: llmtf.base.darumeru/PARus: Loading Dataset: 3.02s
35
+ INFO: 2024-07-14 10:29:59,369: llmtf.base.darumeru/PARus: Processing Dataset: 12.21s
36
+ INFO: 2024-07-14 10:29:59,371: llmtf.base.darumeru/PARus: Results for darumeru/PARus:
37
+ INFO: 2024-07-14 10:29:59,397: llmtf.base.darumeru/PARus: {'acc': 0.77}
38
+ INFO: 2024-07-14 10:29:59,399: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
39
+ INFO: 2024-07-14 10:29:59,399: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
40
+ INFO: 2024-07-14 10:30:03,489: llmtf.base.darumeru/RCB: Loading Dataset: 4.09s
41
+ INFO: 2024-07-14 10:30:09,527: llmtf.base.darumeru/cp_sent_ru: Processing Dataset: 394.62s
42
+ INFO: 2024-07-14 10:30:09,530: llmtf.base.darumeru/cp_sent_ru: Results for darumeru/cp_sent_ru:
43
+ INFO: 2024-07-14 10:30:09,551: llmtf.base.darumeru/cp_sent_ru: {'symbol_per_token': 2.854204461218866, 'len': 0.3139758726899384, 'lcs': 0.3140327089331601}
44
+ INFO: 2024-07-14 10:30:09,554: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
45
+ INFO: 2024-07-14 10:30:09,554: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
46
+ INFO: 2024-07-14 10:30:09,713: llmtf.base.daru/treewayextractive: Processing Dataset: 389.35s
47
+ INFO: 2024-07-14 10:30:09,714: llmtf.base.daru/treewayextractive: Results for daru/treewayextractive:
48
+ INFO: 2024-07-14 10:30:09,932: llmtf.base.daru/treewayextractive: {'r-prec': 0.4072662337662338}
49
+ INFO: 2024-07-14 10:30:09,977: llmtf.base.evaluator: Ended eval
50
+ INFO: 2024-07-14 10:30:09,982: llmtf.base.evaluator:
51
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/cp_sent_ru
52
+ 0.502 0.407 0.517 0.770 0.314
53
+ INFO: 2024-07-14 10:30:12,945: llmtf.base.darumeru/cp_sent_en: Loading Dataset: 3.39s
54
+ INFO: 2024-07-14 10:30:24,638: llmtf.base.darumeru/RCB: Processing Dataset: 21.15s
55
+ INFO: 2024-07-14 10:30:24,641: llmtf.base.darumeru/RCB: Results for darumeru/RCB:
56
+ INFO: 2024-07-14 10:30:24,676: llmtf.base.darumeru/RCB: {'acc': 0.41363636363636364, 'f1_macro': 0.4105113251051133}
57
+ INFO: 2024-07-14 10:30:24,688: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
58
+ INFO: 2024-07-14 10:30:24,688: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
59
+ INFO: 2024-07-14 10:30:38,213: llmtf.base.darumeru/ruOpenBookQA: Loading Dataset: 13.52s
60
+ INFO: 2024-07-14 10:32:43,135: llmtf.base.darumeru/ruOpenBookQA: Processing Dataset: 124.91s
61
+ INFO: 2024-07-14 10:32:43,137: llmtf.base.darumeru/ruOpenBookQA: Results for darumeru/ruOpenBookQA:
62
+ INFO: 2024-07-14 10:32:43,165: llmtf.base.darumeru/ruOpenBookQA: {'acc': 0.7074742268041238, 'f1_macro': 0.7072263442662465}
63
+ INFO: 2024-07-14 10:32:43,181: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
64
+ INFO: 2024-07-14 10:32:43,181: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
65
+ INFO: 2024-07-14 10:32:50,754: llmtf.base.darumeru/ruTiE: Loading Dataset: 7.57s
66
+ INFO: 2024-07-14 10:36:41,628: llmtf.base.darumeru/cp_sent_en: Processing Dataset: 388.68s
67
+ INFO: 2024-07-14 10:36:41,661: llmtf.base.darumeru/cp_sent_en: Results for darumeru/cp_sent_en:
68
+ INFO: 2024-07-14 10:36:41,680: llmtf.base.darumeru/cp_sent_en: {'symbol_per_token': 4.414480033245095, 'len': 0.24456935975609756, 'lcs': 0.2492914705708347}
69
+ INFO: 2024-07-14 10:36:41,682: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
70
+ INFO: 2024-07-14 10:36:41,682: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
71
+ INFO: 2024-07-14 10:36:45,508: llmtf.base.darumeru/cp_para_ru: Loading Dataset: 3.83s
72
+ INFO: 2024-07-14 10:37:21,211: llmtf.base.darumeru/ruTiE: Processing Dataset: 270.44s
73
+ INFO: 2024-07-14 10:37:21,212: llmtf.base.darumeru/ruTiE: Results for darumeru/ruTiE:
74
+ INFO: 2024-07-14 10:37:21,241: llmtf.base.darumeru/ruTiE: {'acc': 0.42093023255813955}
75
+ INFO: 2024-07-14 10:37:21,244: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
76
+ INFO: 2024-07-14 10:37:21,244: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
77
+ INFO: 2024-07-14 10:37:23,894: llmtf.base.darumeru/ruWorldTree: Loading Dataset: 2.65s
78
+ INFO: 2024-07-14 10:37:31,204: llmtf.base.darumeru/ruWorldTree: Processing Dataset: 7.31s
79
+ INFO: 2024-07-14 10:37:31,206: llmtf.base.darumeru/ruWorldTree: Results for darumeru/ruWorldTree:
80
+ INFO: 2024-07-14 10:37:31,211: llmtf.base.darumeru/ruWorldTree: {'acc': 0.8380952380952381, 'f1_macro': 0.8343115676204449}
81
+ INFO: 2024-07-14 10:37:31,213: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
82
+ INFO: 2024-07-14 10:37:31,213: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
83
+ INFO: 2024-07-14 10:37:34,764: llmtf.base.darumeru/RWSD: Loading Dataset: 3.55s
84
+ INFO: 2024-07-14 10:37:53,226: llmtf.base.darumeru/RWSD: Processing Dataset: 18.46s
85
+ INFO: 2024-07-14 10:37:53,244: llmtf.base.darumeru/RWSD: Results for darumeru/RWSD:
86
+ INFO: 2024-07-14 10:37:53,261: llmtf.base.darumeru/RWSD: {'acc': 0.49019607843137253}
87
+ INFO: 2024-07-14 10:37:53,263: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
88
+ INFO: 2024-07-14 10:37:53,263: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
89
+ INFO: 2024-07-14 10:38:07,648: llmtf.base.darumeru/USE: Loading Dataset: 14.38s
90
+ INFO: 2024-07-14 10:42:11,801: llmtf.base.darumeru/USE: Processing Dataset: 244.14s
91
+ INFO: 2024-07-14 10:42:11,819: llmtf.base.darumeru/USE: Results for darumeru/USE:
92
+ INFO: 2024-07-14 10:42:11,839: llmtf.base.darumeru/USE: {'grade_norm': 0.10882352941176472}
93
+ INFO: 2024-07-14 10:42:11,846: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
94
+ INFO: 2024-07-14 10:42:11,846: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
95
+ INFO: 2024-07-14 10:42:30,996: llmtf.base.russiannlp/rucola_custom: Loading Dataset: 19.15s
96
+ INFO: 2024-07-14 10:43:13,353: llmtf.base.nlpcoreteam/enMMLU: Processing Dataset: 985.56s
97
+ INFO: 2024-07-14 10:43:13,356: llmtf.base.nlpcoreteam/enMMLU: Results for nlpcoreteam/enMMLU:
98
+ INFO: 2024-07-14 10:43:13,402: llmtf.base.nlpcoreteam/enMMLU: metric
99
+ subject
100
+ abstract_algebra 0.310000
101
+ anatomy 0.696296
102
+ astronomy 0.697368
103
+ business_ethics 0.650000
104
+ clinical_knowledge 0.754717
105
+ college_biology 0.770833
106
+ college_chemistry 0.470000
107
+ college_computer_science 0.470000
108
+ college_mathematics 0.340000
109
+ college_medicine 0.647399
110
+ college_physics 0.500000
111
+ computer_security 0.800000
112
+ conceptual_physics 0.595745
113
+ econometrics 0.526316
114
+ electrical_engineering 0.655172
115
+ elementary_mathematics 0.441799
116
+ formal_logic 0.492063
117
+ global_facts 0.330000
118
+ high_school_biology 0.777419
119
+ high_school_chemistry 0.551724
120
+ high_school_computer_science 0.680000
121
+ high_school_european_history 0.769697
122
+ high_school_geography 0.808081
123
+ high_school_government_and_politics 0.891192
124
+ high_school_macroeconomics 0.653846
125
+ high_school_mathematics 0.392593
126
+ high_school_microeconomics 0.731092
127
+ high_school_physics 0.450331
128
+ high_school_psychology 0.849541
129
+ high_school_statistics 0.541667
130
+ high_school_us_history 0.857843
131
+ high_school_world_history 0.827004
132
+ human_aging 0.713004
133
+ human_sexuality 0.770992
134
+ international_law 0.851240
135
+ jurisprudence 0.759259
136
+ logical_fallacies 0.736196
137
+ machine_learning 0.517857
138
+ management 0.883495
139
+ marketing 0.888889
140
+ medical_genetics 0.790000
141
+ miscellaneous 0.831418
142
+ moral_disputes 0.719653
143
+ moral_scenarios 0.412291
144
+ nutrition 0.767974
145
+ philosophy 0.749196
146
+ prehistory 0.734568
147
+ professional_accounting 0.482270
148
+ professional_law 0.468709
149
+ professional_medicine 0.716912
150
+ professional_psychology 0.722222
151
+ public_relations 0.718182
152
+ security_studies 0.759184
153
+ sociology 0.865672
154
+ us_foreign_policy 0.870000
155
+ virology 0.572289
156
+ world_religions 0.818713
157
+ INFO: 2024-07-14 10:43:13,410: llmtf.base.nlpcoreteam/enMMLU: metric
158
+ subject
159
+ STEM 0.553473
160
+ humanities 0.707418
161
+ other (business, health, misc.) 0.694619
162
+ social sciences 0.763860
163
+ INFO: 2024-07-14 10:43:13,431: llmtf.base.nlpcoreteam/enMMLU: {'acc': 0.6798423546157744}
164
+ INFO: 2024-07-14 10:43:13,499: llmtf.base.evaluator: Ended eval
165
+ INFO: 2024-07-14 10:43:13,515: llmtf.base.evaluator:
166
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU
167
+ 0.492 0.407 0.517 0.770 0.412 0.490 0.109 0.245 0.314 0.707 0.421 0.836 0.680
168
+ INFO: 2024-07-14 10:45:03,288: llmtf.base.darumeru/ruMMLU: Processing Dataset: 1215.85s
169
+ INFO: 2024-07-14 10:45:03,291: llmtf.base.darumeru/ruMMLU: Results for darumeru/ruMMLU:
170
+ INFO: 2024-07-14 10:45:03,300: llmtf.base.darumeru/ruMMLU: {'acc': 0.5003491968472513}
171
+ INFO: 2024-07-14 10:45:03,379: llmtf.base.evaluator: Ended eval
172
+ INFO: 2024-07-14 10:45:03,395: llmtf.base.evaluator:
173
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU
174
+ 0.493 0.407 0.517 0.770 0.412 0.490 0.109 0.245 0.314 0.500 0.707 0.421 0.836 0.680
175
+ INFO: 2024-07-14 10:45:17,598: llmtf.base.russiannlp/rucola_custom: Processing Dataset: 166.60s
176
+ INFO: 2024-07-14 10:45:17,602: llmtf.base.russiannlp/rucola_custom: Results for russiannlp/rucola_custom:
177
+ INFO: 2024-07-14 10:45:17,620: llmtf.base.russiannlp/rucola_custom: {'acc': 0.7240760674560459, 'mcc': 0.36043904403572885}
178
+ INFO: 2024-07-14 10:45:17,631: llmtf.base.evaluator: Ended eval
179
+ INFO: 2024-07-14 10:45:17,732: llmtf.base.evaluator:
180
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU russiannlp/rucola_custom
181
+ 0.497 0.407 0.517 0.770 0.412 0.490 0.109 0.245 0.314 0.500 0.707 0.421 0.836 0.680 0.542
182
+ INFO: 2024-07-14 10:51:53,998: llmtf.base.nlpcoreteam/ruMMLU: Processing Dataset: 1460.75s
183
+ INFO: 2024-07-14 10:51:54,015: llmtf.base.nlpcoreteam/ruMMLU: Results for nlpcoreteam/ruMMLU:
184
+ INFO: 2024-07-14 10:51:54,061: llmtf.base.nlpcoreteam/ruMMLU: metric
185
+ subject
186
+ abstract_algebra 0.350000
187
+ anatomy 0.444444
188
+ astronomy 0.638158
189
+ business_ethics 0.630000
190
+ clinical_knowledge 0.581132
191
+ college_biology 0.569444
192
+ college_chemistry 0.410000
193
+ college_computer_science 0.430000
194
+ college_mathematics 0.340000
195
+ college_medicine 0.549133
196
+ college_physics 0.323529
197
+ computer_security 0.700000
198
+ conceptual_physics 0.527660
199
+ econometrics 0.438596
200
+ electrical_engineering 0.537931
201
+ elementary_mathematics 0.394180
202
+ formal_logic 0.420635
203
+ global_facts 0.330000
204
+ high_school_biology 0.658065
205
+ high_school_chemistry 0.433498
206
+ high_school_computer_science 0.660000
207
+ high_school_european_history 0.727273
208
+ high_school_geography 0.691919
209
+ high_school_government_and_politics 0.683938
210
+ high_school_macroeconomics 0.548718
211
+ high_school_mathematics 0.400000
212
+ high_school_microeconomics 0.525210
213
+ high_school_physics 0.357616
214
+ high_school_psychology 0.662385
215
+ high_school_statistics 0.504630
216
+ high_school_us_history 0.705882
217
+ high_school_world_history 0.742616
218
+ human_aging 0.560538
219
+ human_sexuality 0.625954
220
+ international_law 0.743802
221
+ jurisprudence 0.666667
222
+ logical_fallacies 0.558282
223
+ machine_learning 0.526786
224
+ management 0.757282
225
+ marketing 0.709402
226
+ medical_genetics 0.620000
227
+ miscellaneous 0.629630
228
+ moral_disputes 0.598266
229
+ moral_scenarios 0.392179
230
+ nutrition 0.643791
231
+ philosophy 0.617363
232
+ prehistory 0.583333
233
+ professional_accounting 0.375887
234
+ professional_law 0.384615
235
+ professional_medicine 0.503676
236
+ professional_psychology 0.503268
237
+ public_relations 0.572727
238
+ security_studies 0.669388
239
+ sociology 0.696517
240
+ us_foreign_policy 0.780000
241
+ virology 0.500000
242
+ world_religions 0.672515
243
+ INFO: 2024-07-14 10:51:54,068: llmtf.base.nlpcoreteam/ruMMLU: metric
244
+ subject
245
+ STEM 0.486750
246
+ humanities 0.601033
247
+ other (business, health, misc.) 0.559637
248
+ social sciences 0.616552
249
+ INFO: 2024-07-14 10:51:54,076: llmtf.base.nlpcoreteam/ruMMLU: {'acc': 0.5659927988894412}
250
+ INFO: 2024-07-14 10:51:54,155: llmtf.base.evaluator: Ended eval
251
+ INFO: 2024-07-14 10:51:54,169: llmtf.base.evaluator:
252
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
253
+ 0.501 0.407 0.517 0.770 0.412 0.490 0.109 0.245 0.314 0.500 0.707 0.421 0.836 0.680 0.566 0.542
254
+ INFO: 2024-07-14 10:59:26,805: llmtf.base.darumeru/cp_para_ru: Processing Dataset: 1361.30s
255
+ INFO: 2024-07-14 10:59:26,809: llmtf.base.darumeru/cp_para_ru: Results for darumeru/cp_para_ru:
256
+ INFO: 2024-07-14 10:59:26,814: llmtf.base.darumeru/cp_para_ru: {'symbol_per_token': 2.8209077531855185, 'len': 0.1755224609375, 'lcs': 0.1860311194855058}
257
+ INFO: 2024-07-14 10:59:26,815: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
258
+ INFO: 2024-07-14 10:59:26,815: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
259
+ INFO: 2024-07-14 10:59:30,799: llmtf.base.darumeru/cp_para_en: Loading Dataset: 3.98s
260
+ INFO: 2024-07-14 11:06:06,415: llmtf.base.evaluator: Starting eval on ['darumeru/rummlu']
261
+ INFO: 2024-07-14 11:06:06,418: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
262
+ INFO: 2024-07-14 11:06:06,418: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
263
+ INFO: 2024-07-14 11:06:06,474: llmtf.base.evaluator: Starting eval on ['darumeru/multiq', 'darumeru/parus', 'darumeru/rcb', 'darumeru/ruopenbookqa', 'darumeru/rutie', 'darumeru/ruworldtree', 'darumeru/rwsd', 'darumeru/use', 'russiannlp/rucola_custom']
264
+ INFO: 2024-07-14 11:06:06,475: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
265
+ INFO: 2024-07-14 11:06:06,475: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
266
+ INFO: 2024-07-14 11:06:06,800: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/rummlu']
267
+ INFO: 2024-07-14 11:06:06,801: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
268
+ INFO: 2024-07-14 11:06:06,801: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
269
+ INFO: 2024-07-14 11:06:06,963: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/enmmlu']
270
+ INFO: 2024-07-14 11:06:06,963: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
271
+ INFO: 2024-07-14 11:06:06,963: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
272
+ INFO: 2024-07-14 11:06:08,918: llmtf.base.evaluator: Starting eval on ['daru/treewayabstractive']
273
+ INFO: 2024-07-14 11:06:08,935: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
274
+ INFO: 2024-07-14 11:06:08,935: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
275
+ INFO: 2024-07-14 11:06:10,139: llmtf.base.evaluator: Starting eval on ['daru/treewayextractive']
276
+ INFO: 2024-07-14 11:06:10,140: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
277
+ INFO: 2024-07-14 11:06:10,140: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
278
+ INFO: 2024-07-14 11:06:12,863: llmtf.base.evaluator: Starting eval on ['darumeru/cp_sent_ru', 'darumeru/cp_sent_en', 'darumeru/cp_para_ru', 'darumeru/cp_para_en']
279
+ INFO: 2024-07-14 11:06:12,864: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
280
+ INFO: 2024-07-14 11:06:12,864: llmtf.base.hfmodel: Updated generation_config.stop_strings: []
281
+ INFO: 2024-07-14 11:06:17,073: llmtf.base.darumeru/cp_sent_ru: Loading Dataset: 4.21s
282
+ INFO: 2024-07-14 11:06:22,184: llmtf.base.daru/treewayextractive: Loading Dataset: 12.04s
283
+ INFO: 2024-07-14 11:06:26,015: llmtf.base.daru/treewayabstractive: Loading Dataset: 17.08s
284
+ INFO: 2024-07-14 11:06:28,367: llmtf.base.darumeru/MultiQ: Loading Dataset: 21.89s
285
+ INFO: 2024-07-14 11:07:31,487: llmtf.base.darumeru/ruMMLU: Loading Dataset: 85.07s
286
+ INFO: 2024-07-14 11:09:34,167: llmtf.base.nlpcoreteam/enMMLU: Loading Dataset: 207.20s
287
+ INFO: 2024-07-14 11:10:20,566: llmtf.base.nlpcoreteam/ruMMLU: Loading Dataset: 253.76s
288
+ INFO: 2024-07-14 11:13:00,146: llmtf.base.evaluator: Starting eval on ['darumeru/multiq', 'darumeru/parus', 'darumeru/rcb', 'darumeru/ruopenbookqa', 'darumeru/rutie', 'darumeru/ruworldtree', 'darumeru/rwsd', 'darumeru/use', 'russiannlp/rucola_custom']
289
+ INFO: 2024-07-14 11:13:00,147: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
290
+ INFO: 2024-07-14 11:13:00,148: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
291
+ INFO: 2024-07-14 11:13:00,763: llmtf.base.evaluator: Starting eval on ['darumeru/rummlu']
292
+ INFO: 2024-07-14 11:13:00,764: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
293
+ INFO: 2024-07-14 11:13:00,764: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
294
+ INFO: 2024-07-14 11:13:01,156: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/rummlu']
295
+ INFO: 2024-07-14 11:13:01,157: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
296
+ INFO: 2024-07-14 11:13:01,157: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
297
+ INFO: 2024-07-14 11:13:03,344: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/enmmlu']
298
+ INFO: 2024-07-14 11:13:03,344: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
299
+ INFO: 2024-07-14 11:13:03,345: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
300
+ INFO: 2024-07-14 11:13:05,847: llmtf.base.evaluator: Starting eval on ['daru/treewayabstractive']
301
+ INFO: 2024-07-14 11:13:05,847: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
302
+ INFO: 2024-07-14 11:13:05,847: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
303
+ INFO: 2024-07-14 11:13:07,688: llmtf.base.evaluator: Starting eval on ['daru/treewayextractive']
304
+ INFO: 2024-07-14 11:13:07,689: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
305
+ INFO: 2024-07-14 11:13:07,690: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
306
+ INFO: 2024-07-14 11:13:08,542: llmtf.base.evaluator: Starting eval on ['darumeru/cp_sent_ru', 'darumeru/cp_sent_en', 'darumeru/cp_para_ru', 'darumeru/cp_para_en']
307
+ INFO: 2024-07-14 11:13:08,556: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
308
+ INFO: 2024-07-14 11:13:08,556: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
309
+ INFO: 2024-07-14 11:13:13,247: llmtf.base.darumeru/cp_sent_ru: Loading Dataset: 4.69s
310
+ INFO: 2024-07-14 11:13:19,620: llmtf.base.daru/treewayextractive: Loading Dataset: 11.93s
311
+ INFO: 2024-07-14 11:13:21,465: llmtf.base.darumeru/MultiQ: Loading Dataset: 21.32s
312
+ INFO: 2024-07-14 11:13:22,631: llmtf.base.daru/treewayabstractive: Loading Dataset: 16.78s
313
+ INFO: 2024-07-14 11:14:26,160: llmtf.base.darumeru/ruMMLU: Loading Dataset: 85.40s
314
+ INFO: 2024-07-14 11:16:29,276: llmtf.base.nlpcoreteam/enMMLU: Loading Dataset: 205.93s
315
+ INFO: 2024-07-14 11:17:11,778: llmtf.base.nlpcoreteam/ruMMLU: Loading Dataset: 250.62s
316
+ INFO: 2024-07-14 11:19:17,117: llmtf.base.darumeru/MultiQ: Processing Dataset: 355.65s
317
+ INFO: 2024-07-14 11:19:17,119: llmtf.base.darumeru/MultiQ: Results for darumeru/MultiQ:
318
+ INFO: 2024-07-14 11:19:17,142: llmtf.base.darumeru/MultiQ: {'f1': 0.5670386205042036, 'em': 0.4646271510516252}
319
+ INFO: 2024-07-14 11:19:17,153: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
320
+ INFO: 2024-07-14 11:19:17,153: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
321
+ INFO: 2024-07-14 11:19:20,859: llmtf.base.darumeru/PARus: Loading Dataset: 3.70s
322
+ INFO: 2024-07-14 11:19:32,984: llmtf.base.darumeru/PARus: Processing Dataset: 12.11s
323
+ INFO: 2024-07-14 11:19:32,999: llmtf.base.darumeru/PARus: Results for darumeru/PARus:
324
+ INFO: 2024-07-14 11:19:33,016: llmtf.base.darumeru/PARus: {'acc': 0.77}
325
+ INFO: 2024-07-14 11:19:33,019: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
326
+ INFO: 2024-07-14 11:19:33,019: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
327
+ INFO: 2024-07-14 11:19:37,041: llmtf.base.darumeru/RCB: Loading Dataset: 4.02s
328
+ INFO: 2024-07-14 11:19:40,945: llmtf.base.darumeru/cp_sent_ru: Processing Dataset: 387.69s
329
+ INFO: 2024-07-14 11:19:40,949: llmtf.base.darumeru/cp_sent_ru: Results for darumeru/cp_sent_ru:
330
+ INFO: 2024-07-14 11:19:40,956: llmtf.base.darumeru/cp_sent_ru: {'symbol_per_token': 2.852724111665359, 'len': 0.3139758726899384, 'lcs': 0.3142940152947527}
331
+ INFO: 2024-07-14 11:19:40,959: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
332
+ INFO: 2024-07-14 11:19:40,959: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
333
+ INFO: 2024-07-14 11:19:44,820: llmtf.base.darumeru/cp_sent_en: Loading Dataset: 3.86s
334
+ INFO: 2024-07-14 11:19:54,656: llmtf.base.daru/treewayextractive: Processing Dataset: 395.02s
335
+ INFO: 2024-07-14 11:19:54,657: llmtf.base.daru/treewayextractive: Results for daru/treewayextractive:
336
+ INFO: 2024-07-14 11:19:54,894: llmtf.base.daru/treewayextractive: {'r-prec': 0.4072662337662338}
337
+ INFO: 2024-07-14 11:19:54,939: llmtf.base.evaluator: Ended eval
338
+ INFO: 2024-07-14 11:19:54,959: llmtf.base.evaluator:
339
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
340
+ 0.481 0.407 0.516 0.770 0.412 0.490 0.109 0.186 0.245 0.314 0.500 0.707 0.421 0.836 0.680 0.566 0.542
341
+ INFO: 2024-07-14 11:19:58,004: llmtf.base.darumeru/RCB: Processing Dataset: 20.95s
342
+ INFO: 2024-07-14 11:19:58,006: llmtf.base.darumeru/RCB: Results for darumeru/RCB:
343
+ INFO: 2024-07-14 11:19:58,016: llmtf.base.darumeru/RCB: {'acc': 0.41363636363636364, 'f1_macro': 0.4105113251051133}
344
+ INFO: 2024-07-14 11:19:58,018: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
345
+ INFO: 2024-07-14 11:19:58,019: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
346
+ INFO: 2024-07-14 11:20:11,977: llmtf.base.darumeru/ruOpenBookQA: Loading Dataset: 13.96s
347
+ INFO: 2024-07-14 11:22:15,668: llmtf.base.darumeru/ruOpenBookQA: Processing Dataset: 123.69s
348
+ INFO: 2024-07-14 11:22:15,686: llmtf.base.darumeru/ruOpenBookQA: Results for darumeru/ruOpenBookQA:
349
+ INFO: 2024-07-14 11:22:15,718: llmtf.base.darumeru/ruOpenBookQA: {'acc': 0.7074742268041238, 'f1_macro': 0.7072263442662465}
350
+ INFO: 2024-07-14 11:22:15,734: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
351
+ INFO: 2024-07-14 11:22:15,734: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
352
+ INFO: 2024-07-14 11:22:22,911: llmtf.base.darumeru/ruTiE: Loading Dataset: 7.18s
353
+ INFO: 2024-07-14 11:31:34,412: llmtf.base.evaluator: Starting eval on ['darumeru/cp_sent_ru', 'darumeru/cp_sent_en', 'darumeru/cp_para_ru', 'darumeru/cp_para_en']
354
+ INFO: 2024-07-14 11:31:34,413: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
355
+ INFO: 2024-07-14 11:31:34,413: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
356
+ INFO: 2024-07-14 11:31:36,398: llmtf.base.evaluator: Starting eval on ['darumeru/multiq', 'darumeru/parus', 'darumeru/rcb', 'darumeru/ruopenbookqa', 'darumeru/rutie', 'darumeru/ruworldtree', 'darumeru/rwsd', 'darumeru/use', 'russiannlp/rucola_custom']
357
+ INFO: 2024-07-14 11:31:36,399: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
358
+ INFO: 2024-07-14 11:31:36,399: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
359
+ INFO: 2024-07-14 11:31:36,412: llmtf.base.evaluator: Starting eval on ['darumeru/rummlu']
360
+ INFO: 2024-07-14 11:31:36,413: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
361
+ INFO: 2024-07-14 11:31:36,413: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
362
+ INFO: 2024-07-14 11:31:36,456: llmtf.base.evaluator: Starting eval on ['daru/treewayextractive']
363
+ INFO: 2024-07-14 11:31:36,457: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
364
+ INFO: 2024-07-14 11:31:36,457: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
365
+ INFO: 2024-07-14 11:31:36,781: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/enmmlu']
366
+ INFO: 2024-07-14 11:31:36,782: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
367
+ INFO: 2024-07-14 11:31:36,782: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
368
+ INFO: 2024-07-14 11:31:37,004: llmtf.base.evaluator: Starting eval on ['daru/treewayabstractive']
369
+ INFO: 2024-07-14 11:31:37,005: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
370
+ INFO: 2024-07-14 11:31:37,005: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
371
+ INFO: 2024-07-14 11:31:37,196: llmtf.base.evaluator: Starting eval on ['nlpcoreteam/rummlu']
372
+ INFO: 2024-07-14 11:31:37,196: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
373
+ INFO: 2024-07-14 11:31:37,196: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
374
+ INFO: 2024-07-14 11:31:38,743: llmtf.base.darumeru/cp_sent_ru: Loading Dataset: 4.33s
375
+ INFO: 2024-07-14 11:31:48,581: llmtf.base.daru/treewayextractive: Loading Dataset: 12.12s
376
+ INFO: 2024-07-14 11:31:53,872: llmtf.base.daru/treewayabstractive: Loading Dataset: 16.87s
377
+ INFO: 2024-07-14 11:31:57,994: llmtf.base.darumeru/MultiQ: Loading Dataset: 21.59s
378
+ INFO: 2024-07-14 11:33:00,585: llmtf.base.darumeru/ruMMLU: Loading Dataset: 84.17s
379
+ INFO: 2024-07-14 11:35:00,486: llmtf.base.nlpcoreteam/enMMLU: Loading Dataset: 203.70s
380
+ INFO: 2024-07-14 11:35:45,376: llmtf.base.nlpcoreteam/ruMMLU: Loading Dataset: 248.18s
381
+ INFO: 2024-07-14 11:37:58,433: llmtf.base.darumeru/MultiQ: Processing Dataset: 360.42s
382
+ INFO: 2024-07-14 11:37:58,449: llmtf.base.darumeru/MultiQ: Results for darumeru/MultiQ:
383
+ INFO: 2024-07-14 11:37:58,477: llmtf.base.darumeru/MultiQ: {'f1': 0.5675109413637138, 'em': 0.4655831739961759}
384
+ INFO: 2024-07-14 11:37:58,487: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
385
+ INFO: 2024-07-14 11:37:58,488: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
386
+ INFO: 2024-07-14 11:38:02,069: llmtf.base.darumeru/PARus: Loading Dataset: 3.58s
387
+ INFO: 2024-07-14 11:38:11,194: llmtf.base.darumeru/cp_sent_ru: Processing Dataset: 392.45s
388
+ INFO: 2024-07-14 11:38:11,198: llmtf.base.darumeru/cp_sent_ru: Results for darumeru/cp_sent_ru:
389
+ INFO: 2024-07-14 11:38:11,219: llmtf.base.darumeru/cp_sent_ru: {'symbol_per_token': 2.8238989214765224, 'len': 0.9998130402818972, 'lcs': 0.9997733257303255}
390
+ INFO: 2024-07-14 11:38:11,222: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
391
+ INFO: 2024-07-14 11:38:11,222: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
392
+ INFO: 2024-07-14 11:38:14,071: llmtf.base.darumeru/PARus: Processing Dataset: 12.00s
393
+ INFO: 2024-07-14 11:38:14,073: llmtf.base.darumeru/PARus: Results for darumeru/PARus:
394
+ INFO: 2024-07-14 11:38:14,103: llmtf.base.darumeru/PARus: {'acc': 0.77}
395
+ INFO: 2024-07-14 11:38:14,105: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
396
+ INFO: 2024-07-14 11:38:14,105: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
397
+ INFO: 2024-07-14 11:38:15,216: llmtf.base.darumeru/cp_sent_en: Loading Dataset: 3.99s
398
+ INFO: 2024-07-14 11:38:17,615: llmtf.base.darumeru/RCB: Loading Dataset: 3.51s
399
+ INFO: 2024-07-14 11:38:22,819: llmtf.base.daru/treewayextractive: Processing Dataset: 394.24s
400
+ INFO: 2024-07-14 11:38:22,820: llmtf.base.daru/treewayextractive: Results for daru/treewayextractive:
401
+ INFO: 2024-07-14 11:38:23,092: llmtf.base.daru/treewayextractive: {'r-prec': 0.4072662337662338}
402
+ INFO: 2024-07-14 11:38:23,137: llmtf.base.evaluator: Ended eval
403
+ INFO: 2024-07-14 11:38:23,245: llmtf.base.evaluator:
404
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
405
+ 0.524 0.407 0.517 0.770 0.412 0.490 0.109 0.186 0.245 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
406
+ INFO: 2024-07-14 11:38:38,403: llmtf.base.darumeru/RCB: Processing Dataset: 20.75s
407
+ INFO: 2024-07-14 11:38:38,418: llmtf.base.darumeru/RCB: Results for darumeru/RCB:
408
+ INFO: 2024-07-14 11:38:38,456: llmtf.base.darumeru/RCB: {'acc': 0.41363636363636364, 'f1_macro': 0.4105113251051133}
409
+ INFO: 2024-07-14 11:38:38,458: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
410
+ INFO: 2024-07-14 11:38:38,458: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
411
+ INFO: 2024-07-14 11:38:52,408: llmtf.base.darumeru/ruOpenBookQA: Loading Dataset: 13.95s
412
+ INFO: 2024-07-14 11:40:55,434: llmtf.base.darumeru/ruOpenBookQA: Processing Dataset: 123.01s
413
+ INFO: 2024-07-14 11:40:55,437: llmtf.base.darumeru/ruOpenBookQA: Results for darumeru/ruOpenBookQA:
414
+ INFO: 2024-07-14 11:40:55,468: llmtf.base.darumeru/ruOpenBookQA: {'acc': 0.7074742268041238, 'f1_macro': 0.7072263442662465}
415
+ INFO: 2024-07-14 11:40:55,484: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
416
+ INFO: 2024-07-14 11:40:55,484: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
417
+ INFO: 2024-07-14 11:41:02,650: llmtf.base.darumeru/ruTiE: Loading Dataset: 7.16s
418
+ INFO: 2024-07-14 11:44:41,336: llmtf.base.darumeru/cp_sent_en: Processing Dataset: 386.12s
419
+ INFO: 2024-07-14 11:44:41,339: llmtf.base.darumeru/cp_sent_en: Results for darumeru/cp_sent_en:
420
+ INFO: 2024-07-14 11:44:41,372: llmtf.base.darumeru/cp_sent_en: {'symbol_per_token': 4.424142837938139, 'len': 0.9984438516260162, 'lcs': 0.9974371974918181}
421
+ INFO: 2024-07-14 11:44:41,375: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
422
+ INFO: 2024-07-14 11:44:41,375: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
423
+ INFO: 2024-07-14 11:44:44,557: llmtf.base.darumeru/cp_para_ru: Loading Dataset: 3.18s
424
+ INFO: 2024-07-14 11:45:27,448: llmtf.base.darumeru/ruTiE: Processing Dataset: 264.80s
425
+ INFO: 2024-07-14 11:45:27,449: llmtf.base.darumeru/ruTiE: Results for darumeru/ruTiE:
426
+ INFO: 2024-07-14 11:45:27,510: llmtf.base.darumeru/ruTiE: {'acc': 0.42093023255813955}
427
+ INFO: 2024-07-14 11:45:27,513: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
428
+ INFO: 2024-07-14 11:45:27,513: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
429
+ INFO: 2024-07-14 11:45:30,838: llmtf.base.darumeru/ruWorldTree: Loading Dataset: 3.32s
430
+ INFO: 2024-07-14 11:45:38,023: llmtf.base.darumeru/ruWorldTree: Processing Dataset: 7.18s
431
+ INFO: 2024-07-14 11:45:38,024: llmtf.base.darumeru/ruWorldTree: Results for darumeru/ruWorldTree:
432
+ INFO: 2024-07-14 11:45:38,045: llmtf.base.darumeru/ruWorldTree: {'acc': 0.8380952380952381, 'f1_macro': 0.8343115676204449}
433
+ INFO: 2024-07-14 11:45:38,046: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
434
+ INFO: 2024-07-14 11:45:38,046: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
435
+ INFO: 2024-07-14 11:45:41,831: llmtf.base.darumeru/RWSD: Loading Dataset: 3.78s
436
+ INFO: 2024-07-14 11:45:59,953: llmtf.base.darumeru/RWSD: Processing Dataset: 18.12s
437
+ INFO: 2024-07-14 11:45:59,971: llmtf.base.darumeru/RWSD: Results for darumeru/RWSD:
438
+ INFO: 2024-07-14 11:45:59,977: llmtf.base.darumeru/RWSD: {'acc': 0.49019607843137253}
439
+ INFO: 2024-07-14 11:45:59,979: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
440
+ INFO: 2024-07-14 11:45:59,979: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
441
+ INFO: 2024-07-14 11:46:14,583: llmtf.base.darumeru/USE: Loading Dataset: 14.60s
442
+ INFO: 2024-07-14 11:50:15,530: llmtf.base.darumeru/USE: Processing Dataset: 240.93s
443
+ INFO: 2024-07-14 11:50:15,533: llmtf.base.darumeru/USE: Results for darumeru/USE:
444
+ INFO: 2024-07-14 11:50:15,541: llmtf.base.darumeru/USE: {'grade_norm': 0.1019607843137255}
445
+ INFO: 2024-07-14 11:50:15,547: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
446
+ INFO: 2024-07-14 11:50:15,547: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
447
+ INFO: 2024-07-14 11:50:34,918: llmtf.base.russiannlp/rucola_custom: Loading Dataset: 19.37s
448
+ INFO: 2024-07-14 11:51:25,782: llmtf.base.nlpcoreteam/enMMLU: Processing Dataset: 985.28s
449
+ INFO: 2024-07-14 11:51:25,784: llmtf.base.nlpcoreteam/enMMLU: Results for nlpcoreteam/enMMLU:
450
+ INFO: 2024-07-14 11:51:25,832: llmtf.base.nlpcoreteam/enMMLU: metric
451
+ subject
452
+ abstract_algebra 0.310000
453
+ anatomy 0.696296
454
+ astronomy 0.697368
455
+ business_ethics 0.650000
456
+ clinical_knowledge 0.754717
457
+ college_biology 0.770833
458
+ college_chemistry 0.470000
459
+ college_computer_science 0.470000
460
+ college_mathematics 0.340000
461
+ college_medicine 0.647399
462
+ college_physics 0.500000
463
+ computer_security 0.800000
464
+ conceptual_physics 0.595745
465
+ econometrics 0.526316
466
+ electrical_engineering 0.655172
467
+ elementary_mathematics 0.441799
468
+ formal_logic 0.492063
469
+ global_facts 0.330000
470
+ high_school_biology 0.777419
471
+ high_school_chemistry 0.551724
472
+ high_school_computer_science 0.680000
473
+ high_school_european_history 0.769697
474
+ high_school_geography 0.808081
475
+ high_school_government_and_politics 0.891192
476
+ high_school_macroeconomics 0.653846
477
+ high_school_mathematics 0.392593
478
+ high_school_microeconomics 0.731092
479
+ high_school_physics 0.450331
480
+ high_school_psychology 0.849541
481
+ high_school_statistics 0.541667
482
+ high_school_us_history 0.857843
483
+ high_school_world_history 0.827004
484
+ human_aging 0.713004
485
+ human_sexuality 0.770992
486
+ international_law 0.851240
487
+ jurisprudence 0.759259
488
+ logical_fallacies 0.736196
489
+ machine_learning 0.517857
490
+ management 0.883495
491
+ marketing 0.888889
492
+ medical_genetics 0.790000
493
+ miscellaneous 0.831418
494
+ moral_disputes 0.719653
495
+ moral_scenarios 0.412291
496
+ nutrition 0.767974
497
+ philosophy 0.749196
498
+ prehistory 0.734568
499
+ professional_accounting 0.482270
500
+ professional_law 0.468709
501
+ professional_medicine 0.716912
502
+ professional_psychology 0.722222
503
+ public_relations 0.718182
504
+ security_studies 0.759184
505
+ sociology 0.865672
506
+ us_foreign_policy 0.870000
507
+ virology 0.572289
508
+ world_religions 0.818713
509
+ INFO: 2024-07-14 11:51:25,840: llmtf.base.nlpcoreteam/enMMLU: metric
510
+ subject
511
+ STEM 0.553473
512
+ humanities 0.707418
513
+ other (business, health, misc.) 0.694619
514
+ social sciences 0.763860
515
+ INFO: 2024-07-14 11:51:25,916: llmtf.base.nlpcoreteam/enMMLU: {'acc': 0.6798423546157744}
516
+ INFO: 2024-07-14 11:51:25,986: llmtf.base.evaluator: Ended eval
517
+ INFO: 2024-07-14 11:51:26,157: llmtf.base.evaluator:
518
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
519
+ 0.571 0.407 0.517 0.770 0.412 0.490 0.102 0.186 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
520
+ INFO: 2024-07-14 11:53:16,466: llmtf.base.darumeru/ruMMLU: Processing Dataset: 1215.85s
521
+ INFO: 2024-07-14 11:53:16,498: llmtf.base.darumeru/ruMMLU: Results for darumeru/ruMMLU:
522
+ INFO: 2024-07-14 11:53:16,524: llmtf.base.darumeru/ruMMLU: {'acc': 0.5003491968472513}
523
+ INFO: 2024-07-14 11:53:16,602: llmtf.base.evaluator: Ended eval
524
+ INFO: 2024-07-14 11:53:16,635: llmtf.base.evaluator:
525
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
526
+ 0.571 0.407 0.517 0.770 0.412 0.490 0.102 0.186 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
527
+ INFO: 2024-07-14 11:53:18,760: llmtf.base.russiannlp/rucola_custom: Processing Dataset: 163.84s
528
+ INFO: 2024-07-14 11:53:18,763: llmtf.base.russiannlp/rucola_custom: Results for russiannlp/rucola_custom:
529
+ INFO: 2024-07-14 11:53:18,803: llmtf.base.russiannlp/rucola_custom: {'acc': 0.7240760674560459, 'mcc': 0.36043904403572885}
530
+ INFO: 2024-07-14 11:53:18,814: llmtf.base.evaluator: Ended eval
531
+ INFO: 2024-07-14 11:53:18,828: llmtf.base.evaluator:
532
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
533
+ 0.571 0.407 0.517 0.770 0.412 0.490 0.102 0.186 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
534
+ INFO: 2024-07-14 11:59:54,999: llmtf.base.nlpcoreteam/ruMMLU: Processing Dataset: 1449.62s
535
+ INFO: 2024-07-14 11:59:55,003: llmtf.base.nlpcoreteam/ruMMLU: Results for nlpcoreteam/ruMMLU:
536
+ INFO: 2024-07-14 11:59:55,049: llmtf.base.nlpcoreteam/ruMMLU: metric
537
+ subject
538
+ abstract_algebra 0.350000
539
+ anatomy 0.444444
540
+ astronomy 0.638158
541
+ business_ethics 0.630000
542
+ clinical_knowledge 0.581132
543
+ college_biology 0.569444
544
+ college_chemistry 0.410000
545
+ college_computer_science 0.430000
546
+ college_mathematics 0.340000
547
+ college_medicine 0.549133
548
+ college_physics 0.323529
549
+ computer_security 0.700000
550
+ conceptual_physics 0.527660
551
+ econometrics 0.438596
552
+ electrical_engineering 0.537931
553
+ elementary_mathematics 0.394180
554
+ formal_logic 0.420635
555
+ global_facts 0.330000
556
+ high_school_biology 0.658065
557
+ high_school_chemistry 0.433498
558
+ high_school_computer_science 0.660000
559
+ high_school_european_history 0.727273
560
+ high_school_geography 0.691919
561
+ high_school_government_and_politics 0.683938
562
+ high_school_macroeconomics 0.548718
563
+ high_school_mathematics 0.400000
564
+ high_school_microeconomics 0.525210
565
+ high_school_physics 0.357616
566
+ high_school_psychology 0.662385
567
+ high_school_statistics 0.504630
568
+ high_school_us_history 0.705882
569
+ high_school_world_history 0.742616
570
+ human_aging 0.560538
571
+ human_sexuality 0.625954
572
+ international_law 0.743802
573
+ jurisprudence 0.666667
574
+ logical_fallacies 0.558282
575
+ machine_learning 0.526786
576
+ management 0.757282
577
+ marketing 0.709402
578
+ medical_genetics 0.620000
579
+ miscellaneous 0.629630
580
+ moral_disputes 0.598266
581
+ moral_scenarios 0.392179
582
+ nutrition 0.643791
583
+ philosophy 0.617363
584
+ prehistory 0.583333
585
+ professional_accounting 0.375887
586
+ professional_law 0.384615
587
+ professional_medicine 0.503676
588
+ professional_psychology 0.503268
589
+ public_relations 0.572727
590
+ security_studies 0.669388
591
+ sociology 0.696517
592
+ us_foreign_policy 0.780000
593
+ virology 0.500000
594
+ world_religions 0.672515
595
+ INFO: 2024-07-14 11:59:55,057: llmtf.base.nlpcoreteam/ruMMLU: metric
596
+ subject
597
+ STEM 0.486750
598
+ humanities 0.601033
599
+ other (business, health, misc.) 0.559637
600
+ social sciences 0.616552
601
+ INFO: 2024-07-14 11:59:55,069: llmtf.base.nlpcoreteam/ruMMLU: {'acc': 0.5659927988894412}
602
+ INFO: 2024-07-14 11:59:55,148: llmtf.base.evaluator: Ended eval
603
+ INFO: 2024-07-14 11:59:55,163: llmtf.base.evaluator:
604
+ mean daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
605
+ 0.571 0.407 0.517 0.770 0.412 0.490 0.102 0.186 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
606
+ INFO: 2024-07-14 12:07:14,213: llmtf.base.darumeru/cp_para_ru: Processing Dataset: 1349.64s
607
+ INFO: 2024-07-14 12:07:14,218: llmtf.base.darumeru/cp_para_ru: Results for darumeru/cp_para_ru:
608
+ INFO: 2024-07-14 12:07:14,240: llmtf.base.darumeru/cp_para_ru: {'symbol_per_token': 2.965935841307524, 'len': 0.9998800850030358, 'lcs': 0.9964476909825747}
609
+ INFO: 2024-07-14 12:07:14,242: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [128001, 198, 271]
610
+ INFO: 2024-07-14 12:07:14,242: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['\n', '\n\n']
611
+ INFO: 2024-07-14 12:07:18,918: llmtf.base.darumeru/cp_para_en: Loading Dataset: 4.68s
612
+ INFO: 2024-07-14 12:07:58,685: llmtf.base.daru/treewayabstractive: Processing Dataset: 2164.81s
613
+ INFO: 2024-07-14 12:07:58,687: llmtf.base.daru/treewayabstractive: Results for daru/treewayabstractive:
614
+ INFO: 2024-07-14 12:07:58,692: llmtf.base.daru/treewayabstractive: {'rouge1': 0.35742299153264667, 'rouge2': 0.14485242187705508}
615
+ INFO: 2024-07-14 12:07:58,696: llmtf.base.evaluator: Ended eval
616
+ INFO: 2024-07-14 12:07:58,708: llmtf.base.evaluator:
617
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
618
+ 0.600 0.251 0.407 0.517 0.770 0.412 0.490 0.102 0.996 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
619
+ INFO: 2024-07-14 12:29:54,865: llmtf.base.darumeru/cp_para_en: Processing Dataset: 1355.93s
620
+ INFO: 2024-07-14 12:29:54,869: llmtf.base.darumeru/cp_para_en: Results for darumeru/cp_para_en:
621
+ INFO: 2024-07-14 12:29:54,873: llmtf.base.darumeru/cp_para_en: {'symbol_per_token': 4.484972311760252, 'len': 0.999859659310879, 'lcs': 0.9881793213641535}
622
+ INFO: 2024-07-14 12:29:54,874: llmtf.base.evaluator: Ended eval
623
+ INFO: 2024-07-14 12:29:54,887: llmtf.base.evaluator:
624
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
625
+ 0.621 0.251 0.407 0.517 0.770 0.412 0.490 0.102 0.988 0.996 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
llmtf_eval_k5/evaluation_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ mean daru/treewayabstractive daru/treewayextractive darumeru/MultiQ darumeru/PARus darumeru/RCB darumeru/RWSD darumeru/USE darumeru/cp_para_en darumeru/cp_para_ru darumeru/cp_sent_en darumeru/cp_sent_ru darumeru/ruMMLU darumeru/ruOpenBookQA darumeru/ruTiE darumeru/ruWorldTree nlpcoreteam/enMMLU nlpcoreteam/ruMMLU russiannlp/rucola_custom
2
+ 0.621 0.251 0.407 0.517 0.770 0.412 0.490 0.102 0.988 0.996 0.998 1.000 0.500 0.707 0.421 0.836 0.680 0.566 0.542
llmtf_eval_k5/nlpcoreteam_enMMLU.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a6cf2bc6cc10bf09a17a9138708c5ef1e205619fdb530438e9dc12a5434e972
3
+ size 73160663
llmtf_eval_k5/nlpcoreteam_enMMLU_params.jsonl ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_generation_config": null,
3
+ "model_params": {
4
+ "model_name_or_path": "NousResearch/Meta-Llama-3-8B",
5
+ "generation_config": {
6
+ "bos_token_id": 128000,
7
+ "do_sample": true,
8
+ "eos_token_id": [
9
+ 128001,
10
+ 198,
11
+ 271
12
+ ],
13
+ "max_length": 8192,
14
+ "max_new_tokens": 64,
15
+ "pad_token_id": 128001,
16
+ "stop_strings": [
17
+ "\n",
18
+ "\n\n"
19
+ ],
20
+ "temperature": 0.1,
21
+ "top_k": 40,
22
+ "top_p": 0.9,
23
+ "transformers_version": "4.38.2",
24
+ "trust_remote_code": [
25
+ false
26
+ ]
27
+ },
28
+ "conversation_template": {
29
+ "system_message_template": "{content}\n",
30
+ "user_message_template": "{content}\n",
31
+ "bot_message_template": "{content}\n\n",
32
+ "bot_message_template_incomplete": "{content}",
33
+ "user_role": "user",
34
+ "bot_role": "bot",
35
+ "system_role": "system",
36
+ "suffix": "",
37
+ "add_special_tokens": false,
38
+ "eos_token": [
39
+ "\n",
40
+ "\n\n"
41
+ ],
42
+ "global_prefix": "<|begin_of_text|>"
43
+ },
44
+ "load_in_8bit": false,
45
+ "torch_dtype": "auto",
46
+ "use_flash_attention_2": true,
47
+ "device_map": "cuda:0",
48
+ "use_fast_tokenizer": true,
49
+ "leading_space": false,
50
+ "space_token": null,
51
+ "trust_remote_code": [
52
+ false
53
+ ],
54
+ "max_model_len": 8192
55
+ },
56
+ "task_params": {
57
+ "max_len": 4000,
58
+ "few_shot_count": 5,
59
+ "batch_size": 8,
60
+ "max_sample_per_dataset": 10000000000000,
61
+ "method": "calculate_tokens_proba"
62
+ }
63
+ }