edward77sas commited on
Commit
789127a
·
1 Parent(s): ed8bdb3

Initial commit of evals folder

Browse files
Files changed (16) hide show
  1. evals/DeepSeek-R1-Distill-Qwen-1.5B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results_2025-03-19T18-06-22.517116.json +98 -0
  2. evals/DeepSeek-R1-Distill-Qwen-7B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/results_2025-03-18T17-53-50.666224.json +98 -0
  3. evals/DeepSeek-R1-Distill-Qwen-7B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/results_2025-03-19T18-45-29.108131.json +98 -0
  4. evals/Qwen2.5-1.5B-Instruct/results/Qwen/Qwen2.5-1.5B-Instruct/results_2025-03-18T17-30-30.453980.json +98 -0
  5. evals/Qwen2.5-1.5B-Instruct/results/Qwen/Qwen2.5-1.5B-Instruct/results_2025-03-19T19-51-58.260659.json +98 -0
  6. evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T16-12-57.684927.json +98 -0
  7. evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T16-54-25.306795.json +98 -0
  8. evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T17-09-45.381308.json +98 -0
  9. evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T17-15-43.577739.json +98 -0
  10. evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T22-45-57.990877.json +98 -0
  11. evals/Qwen2.5-1.5B-Open-R1-Distill-0318/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0318/results_2025-03-19T10-18-52.986335.json +98 -0
  12. evals/Qwen2.5-1.5B-Open-R1-Distill/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill/results_2025-03-17T10-59-16.909691.json +98 -0
  13. evals/Qwen2.5-7B-Instruct/results/Qwen/Qwen2.5-7B-Instruct/results_2025-03-19T20-10-18.884787.json +98 -0
  14. evals/Qwen2.5-7B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317/results_2025-03-18T16-35-47.509906.json +98 -0
  15. evals/Qwen2.5-7B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317/results_2025-03-19T16-22-09.954754.json +98 -0
  16. evals/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results_2025-03-17T10-41-38.230702.json +98 -0
evals/DeepSeek-R1-Distill-Qwen-1.5B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results_2025-03-19T18-06-22.517116.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1987404.563364426,
9
+ "end_time": 1988203.22582261,
10
+ "total_evaluation_time_secondes": "798.6624581841752",
11
+ "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|math_500|0": {
18
+ "extractive_match": 0.866,
19
+ "extractive_match_stderr": 0.015249692003854488
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.866,
23
+ "extractive_match_stderr": 0.015249692003854488
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|math_500|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|math_500": {
31
+ "name": "math_500",
32
+ "prompt_function": "math_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/MATH-500",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "test"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "test"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 500,
64
+ "effective_num_docs": 500,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|math_500|0": {
71
+ "hashes": {
72
+ "hash_examples": "adf0cc8311011db2",
73
+ "hash_full_prompts": "63c902dbdbaf1552",
74
+ "hash_input_tokens": "2af397a095a31139",
75
+ "hash_cont_tokens": "bcb223eaec255944"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 500,
79
+ "padded": 0,
80
+ "non_padded": 500,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "bfaad1993ff37a60",
88
+ "hash_full_prompts": "3ceaaade5cf43911",
89
+ "hash_input_tokens": "c663dbac8a64d3e4",
90
+ "hash_cont_tokens": "18b7de9c0f3de706"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 500,
94
+ "padded": 0,
95
+ "non_padded": 500,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/DeepSeek-R1-Distill-Qwen-7B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/results_2025-03-18T17-53-50.666224.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1899825.923879466,
9
+ "end_time": 1901051.417550421,
10
+ "total_evaluation_time_secondes": "1225.4936709550675",
11
+ "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.5333333333333333,
19
+ "extractive_match_stderr": 0.09264111117062017
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.5333333333333333,
23
+ "extractive_match_stderr": 0.09264111117062017
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "253167becf0dfed7",
74
+ "hash_input_tokens": "bf1cc75b5f12dfb8",
75
+ "hash_cont_tokens": "3ac3e0e58dc8d1f5"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "84ff409b6bbf7cc0",
89
+ "hash_input_tokens": "9a8c7e54ce09af84",
90
+ "hash_cont_tokens": "0c7bfaa06d00eaa4"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/DeepSeek-R1-Distill-Qwen-7B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/results_2025-03-19T18-45-29.108131.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1989411.904871259,
9
+ "end_time": 1990549.836358162,
10
+ "total_evaluation_time_secondes": "1137.9314869032241",
11
+ "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|math_500|0": {
18
+ "extractive_match": 0.934,
19
+ "extractive_match_stderr": 0.011114633153652964
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.934,
23
+ "extractive_match_stderr": 0.011114633153652964
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|math_500|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|math_500": {
31
+ "name": "math_500",
32
+ "prompt_function": "math_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/MATH-500",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "test"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "test"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 500,
64
+ "effective_num_docs": 500,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|math_500|0": {
71
+ "hashes": {
72
+ "hash_examples": "adf0cc8311011db2",
73
+ "hash_full_prompts": "63c902dbdbaf1552",
74
+ "hash_input_tokens": "2af397a095a31139",
75
+ "hash_cont_tokens": "cac4733bad35e8e8"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 500,
79
+ "padded": 0,
80
+ "non_padded": 500,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "bfaad1993ff37a60",
88
+ "hash_full_prompts": "3ceaaade5cf43911",
89
+ "hash_input_tokens": "c663dbac8a64d3e4",
90
+ "hash_cont_tokens": "ca1270e8aea98798"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 500,
94
+ "padded": 0,
95
+ "non_padded": 500,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Instruct/results/Qwen/Qwen2.5-1.5B-Instruct/results_2025-03-18T17-30-30.453980.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1898983.620888934,
9
+ "end_time": 1899651.212244441,
10
+ "total_evaluation_time_secondes": "667.5913555070292",
11
+ "model_name": "Qwen/Qwen2.5-1.5B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.0,
19
+ "extractive_match_stderr": 0.0
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.0,
23
+ "extractive_match_stderr": 0.0
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "d1829811f23cf34b",
74
+ "hash_input_tokens": "7211f832bf7f8d79",
75
+ "hash_cont_tokens": "cb72ff864358b2c0"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "09fe8694776a7143",
89
+ "hash_input_tokens": "b52bc353fe82900e",
90
+ "hash_cont_tokens": "cf44ad0095a7289d"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Instruct/results/Qwen/Qwen2.5-1.5B-Instruct/results_2025-03-19T19-51-58.260659.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1993820.241350481,
9
+ "end_time": 1994539.00358457,
10
+ "total_evaluation_time_secondes": "718.7622340889648",
11
+ "model_name": "Qwen/Qwen2.5-1.5B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|math_500|0": {
18
+ "extractive_match": 0.554,
19
+ "extractive_match_stderr": 0.022252153078595897
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.554,
23
+ "extractive_match_stderr": 0.022252153078595897
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|math_500|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|math_500": {
31
+ "name": "math_500",
32
+ "prompt_function": "math_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/MATH-500",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "test"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "test"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 500,
64
+ "effective_num_docs": 500,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|math_500|0": {
71
+ "hashes": {
72
+ "hash_examples": "adf0cc8311011db2",
73
+ "hash_full_prompts": "8ea39bd2d4645692",
74
+ "hash_input_tokens": "b50dbed21f398c5a",
75
+ "hash_cont_tokens": "aab8acf9493cc551"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 500,
79
+ "padded": 0,
80
+ "non_padded": 500,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "bfaad1993ff37a60",
88
+ "hash_full_prompts": "bafb225051f36263",
89
+ "hash_input_tokens": "4eb9b54e733b7bfd",
90
+ "hash_cont_tokens": "2473c03a869943a2"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 500,
94
+ "padded": 0,
95
+ "non_padded": 500,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T16-12-57.684927.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1894144.109569344,
9
+ "end_time": 1894998.424300142,
10
+ "total_evaluation_time_secondes": "854.3147307981271",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.0,
19
+ "extractive_match_stderr": 0.0
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.0,
23
+ "extractive_match_stderr": 0.0
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "d1829811f23cf34b",
74
+ "hash_input_tokens": "7211f832bf7f8d79",
75
+ "hash_cont_tokens": "502b688747b94043"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "09fe8694776a7143",
89
+ "hash_input_tokens": "b52bc353fe82900e",
90
+ "hash_cont_tokens": "fd9582dd0a52a368"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T16-54-25.306795.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1896943.276604601,
9
+ "end_time": 1897486.051500043,
10
+ "total_evaluation_time_secondes": "542.7748954419512",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.0,
19
+ "extractive_match_stderr": 0.0
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.0,
23
+ "extractive_match_stderr": 0.0
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "d1829811f23cf34b",
74
+ "hash_input_tokens": "7211f832bf7f8d79",
75
+ "hash_cont_tokens": "b7d0decfbb6478c5"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "09fe8694776a7143",
89
+ "hash_input_tokens": "b52bc353fe82900e",
90
+ "hash_cont_tokens": "279f0b9068e707a9"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T17-09-45.381308.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1898156.252841867,
9
+ "end_time": 1898406.136190798,
10
+ "total_evaluation_time_secondes": "249.88334893085994",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.0,
19
+ "extractive_match_stderr": 0.0
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.0,
23
+ "extractive_match_stderr": 0.0
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "d1829811f23cf34b",
74
+ "hash_input_tokens": "7211f832bf7f8d79",
75
+ "hash_cont_tokens": "e75f5cf9b76452e2"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "09fe8694776a7143",
89
+ "hash_input_tokens": "b52bc353fe82900e",
90
+ "hash_cont_tokens": "40ea583099edd587"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T17-15-43.577739.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1898679.514228297,
9
+ "end_time": 1898764.340840129,
10
+ "total_evaluation_time_secondes": "84.82661183201708",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.0,
19
+ "extractive_match_stderr": 0.0
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.0,
23
+ "extractive_match_stderr": 0.0
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "d1829811f23cf34b",
74
+ "hash_input_tokens": "7211f832bf7f8d79",
75
+ "hash_cont_tokens": "72855190a84189eb"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "09fe8694776a7143",
89
+ "hash_input_tokens": "b52bc353fe82900e",
90
+ "hash_cont_tokens": "13f647a3d53749da"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T22-45-57.990877.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1917493.299303261,
9
+ "end_time": 1918578.567845826,
10
+ "total_evaluation_time_secondes": "1085.2685425649397",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|math_500|0": {
18
+ "extractive_match": 0.422,
19
+ "extractive_match_stderr": 0.022109039310618552
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.422,
23
+ "extractive_match_stderr": 0.022109039310618552
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|math_500|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|math_500": {
31
+ "name": "math_500",
32
+ "prompt_function": "math_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/MATH-500",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "test"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "test"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 500,
64
+ "effective_num_docs": 500,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|math_500|0": {
71
+ "hashes": {
72
+ "hash_examples": "adf0cc8311011db2",
73
+ "hash_full_prompts": "8ea39bd2d4645692",
74
+ "hash_input_tokens": "b50dbed21f398c5a",
75
+ "hash_cont_tokens": "0789ce17f6800d1e"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 500,
79
+ "padded": 0,
80
+ "non_padded": 500,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "bfaad1993ff37a60",
88
+ "hash_full_prompts": "bafb225051f36263",
89
+ "hash_input_tokens": "4eb9b54e733b7bfd",
90
+ "hash_cont_tokens": "a00be3bbf7712a23"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 500,
94
+ "padded": 0,
95
+ "non_padded": 500,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Open-R1-Distill-0318/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0318/results_2025-03-19T10-18-52.986335.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1956474.007134297,
9
+ "end_time": 1960153.23434271,
10
+ "total_evaluation_time_secondes": "3679.2272084131837",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0318",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|math_500|0": {
18
+ "extractive_match": 0.506,
19
+ "extractive_match_stderr": 0.022381462412439324
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.506,
23
+ "extractive_match_stderr": 0.022381462412439324
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|math_500|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|math_500": {
31
+ "name": "math_500",
32
+ "prompt_function": "math_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/MATH-500",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "test"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "test"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 500,
64
+ "effective_num_docs": 500,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|math_500|0": {
71
+ "hashes": {
72
+ "hash_examples": "adf0cc8311011db2",
73
+ "hash_full_prompts": "8ea39bd2d4645692",
74
+ "hash_input_tokens": "b50dbed21f398c5a",
75
+ "hash_cont_tokens": "c7da8253b1e8072e"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 500,
79
+ "padded": 0,
80
+ "non_padded": 500,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "bfaad1993ff37a60",
88
+ "hash_full_prompts": "bafb225051f36263",
89
+ "hash_input_tokens": "4eb9b54e733b7bfd",
90
+ "hash_cont_tokens": "cc2b459f237a7b5b"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 500,
94
+ "padded": 0,
95
+ "non_padded": 500,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-1.5B-Open-R1-Distill/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill/results_2025-03-17T10-59-16.909691.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1789034.454502437,
9
+ "end_time": 1789777.674175623,
10
+ "total_evaluation_time_secondes": "743.2196731860749",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.0,
19
+ "extractive_match_stderr": 0.0
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.0,
23
+ "extractive_match_stderr": 0.0
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "d1829811f23cf34b",
74
+ "hash_input_tokens": "7211f832bf7f8d79",
75
+ "hash_cont_tokens": "098eb358c8c67ace"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "09fe8694776a7143",
89
+ "hash_input_tokens": "b52bc353fe82900e",
90
+ "hash_cont_tokens": "80fb70da799d7afb"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-7B-Instruct/results/Qwen/Qwen2.5-7B-Instruct/results_2025-03-19T20-10-18.884787.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1995311.985713479,
9
+ "end_time": 1995639.643426115,
10
+ "total_evaluation_time_secondes": "327.657712635817",
11
+ "model_name": "Qwen/Qwen2.5-7B-Instruct",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|math_500|0": {
18
+ "extractive_match": 0.756,
19
+ "extractive_match_stderr": 0.01922673489361458
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.756,
23
+ "extractive_match_stderr": 0.01922673489361458
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|math_500|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|math_500": {
31
+ "name": "math_500",
32
+ "prompt_function": "math_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/MATH-500",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "test"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "test"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 500,
64
+ "effective_num_docs": 500,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|math_500|0": {
71
+ "hashes": {
72
+ "hash_examples": "adf0cc8311011db2",
73
+ "hash_full_prompts": "8ea39bd2d4645692",
74
+ "hash_input_tokens": "b50dbed21f398c5a",
75
+ "hash_cont_tokens": "36dcf14f9584caec"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 500,
79
+ "padded": 0,
80
+ "non_padded": 500,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "bfaad1993ff37a60",
88
+ "hash_full_prompts": "bafb225051f36263",
89
+ "hash_input_tokens": "4eb9b54e733b7bfd",
90
+ "hash_cont_tokens": "7c39254cae6da7e1"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 500,
94
+ "padded": 0,
95
+ "non_padded": 500,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-7B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317/results_2025-03-18T16-35-47.509906.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1895055.944744457,
9
+ "end_time": 1896368.24957376,
10
+ "total_evaluation_time_secondes": "1312.3048293029424",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.1,
19
+ "extractive_match_stderr": 0.055708601453115555
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.1,
23
+ "extractive_match_stderr": 0.055708601453115555
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "d1829811f23cf34b",
74
+ "hash_input_tokens": "7211f832bf7f8d79",
75
+ "hash_cont_tokens": "c0efce6be7426f22"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "09fe8694776a7143",
89
+ "hash_input_tokens": "b52bc353fe82900e",
90
+ "hash_cont_tokens": "e993bfe4f585739c"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/Qwen2.5-7B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317/results_2025-03-19T16-22-09.954754.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1971587.064608292,
9
+ "end_time": 1981950.716929167,
10
+ "total_evaluation_time_secondes": "10363.652320875088",
11
+ "model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|math_500|0": {
18
+ "extractive_match": 0.748,
19
+ "extractive_match_stderr": 0.01943572728224954
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.748,
23
+ "extractive_match_stderr": 0.01943572728224954
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|math_500|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|math_500": {
31
+ "name": "math_500",
32
+ "prompt_function": "math_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/MATH-500",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "test"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "test"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 500,
64
+ "effective_num_docs": 500,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|math_500|0": {
71
+ "hashes": {
72
+ "hash_examples": "adf0cc8311011db2",
73
+ "hash_full_prompts": "8ea39bd2d4645692",
74
+ "hash_input_tokens": "b50dbed21f398c5a",
75
+ "hash_cont_tokens": "224a24b9c50234b7"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 500,
79
+ "padded": 0,
80
+ "non_padded": 500,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "bfaad1993ff37a60",
88
+ "hash_full_prompts": "bafb225051f36263",
89
+ "hash_input_tokens": "4eb9b54e733b7bfd",
90
+ "hash_cont_tokens": "94cd7676981f6c33"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 500,
94
+ "padded": 0,
95
+ "non_padded": 500,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }
evals/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results_2025-03-17T10-41-38.230702.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": -1,
6
+ "max_samples": null,
7
+ "job_id": 0,
8
+ "start_time": 1788110.135742885,
9
+ "end_time": 1788718.983399756,
10
+ "total_evaluation_time_secondes": "608.8476568709593",
11
+ "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
+ "model_sha": "",
13
+ "model_dtype": null,
14
+ "model_size": null
15
+ },
16
+ "results": {
17
+ "custom|aime24|0": {
18
+ "extractive_match": 0.3,
19
+ "extractive_match_stderr": 0.0850962943396763
20
+ },
21
+ "all": {
22
+ "extractive_match": 0.3,
23
+ "extractive_match_stderr": 0.0850962943396763
24
+ }
25
+ },
26
+ "versions": {
27
+ "custom|aime24|0": 1
28
+ },
29
+ "config_tasks": {
30
+ "custom|aime24": {
31
+ "name": "aime24",
32
+ "prompt_function": "aime_prompt_fn",
33
+ "hf_repo": "HuggingFaceH4/aime_2024",
34
+ "hf_subset": "default",
35
+ "metric": [
36
+ {
37
+ "metric_name": "extractive_match",
38
+ "higher_is_better": true,
39
+ "category": "3",
40
+ "use_case": "1",
41
+ "sample_level_fn": "sample_level_fn",
42
+ "corpus_level_fn": "mean"
43
+ }
44
+ ],
45
+ "hf_revision": null,
46
+ "hf_filter": null,
47
+ "hf_avail_splits": [
48
+ "train"
49
+ ],
50
+ "trust_dataset": false,
51
+ "evaluation_splits": [
52
+ "train"
53
+ ],
54
+ "few_shots_split": null,
55
+ "few_shots_select": null,
56
+ "generation_size": 32768,
57
+ "generation_grammar": null,
58
+ "stop_sequence": [],
59
+ "num_samples": null,
60
+ "suite": [
61
+ "custom"
62
+ ],
63
+ "original_num_docs": 30,
64
+ "effective_num_docs": 30,
65
+ "must_remove_duplicate_docs": false,
66
+ "version": 1
67
+ }
68
+ },
69
+ "summary_tasks": {
70
+ "custom|aime24|0": {
71
+ "hashes": {
72
+ "hash_examples": "ddec8fc79d0a014b",
73
+ "hash_full_prompts": "253167becf0dfed7",
74
+ "hash_input_tokens": "bf1cc75b5f12dfb8",
75
+ "hash_cont_tokens": "e14c52e3f66b52bc"
76
+ },
77
+ "truncated": 0,
78
+ "non_truncated": 30,
79
+ "padded": 0,
80
+ "non_padded": 30,
81
+ "effective_few_shots": 0.0,
82
+ "num_truncated_few_shots": 0
83
+ }
84
+ },
85
+ "summary_general": {
86
+ "hashes": {
87
+ "hash_examples": "c903e836a519cf98",
88
+ "hash_full_prompts": "84ff409b6bbf7cc0",
89
+ "hash_input_tokens": "9a8c7e54ce09af84",
90
+ "hash_cont_tokens": "4710145804b70924"
91
+ },
92
+ "truncated": 0,
93
+ "non_truncated": 30,
94
+ "padded": 0,
95
+ "non_padded": 30,
96
+ "num_truncated_few_shots": 0
97
+ }
98
+ }