Commit
·
789127a
1
Parent(s):
ed8bdb3
Initial commit of evals folder
Browse files- evals/DeepSeek-R1-Distill-Qwen-1.5B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results_2025-03-19T18-06-22.517116.json +98 -0
- evals/DeepSeek-R1-Distill-Qwen-7B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/results_2025-03-18T17-53-50.666224.json +98 -0
- evals/DeepSeek-R1-Distill-Qwen-7B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/results_2025-03-19T18-45-29.108131.json +98 -0
- evals/Qwen2.5-1.5B-Instruct/results/Qwen/Qwen2.5-1.5B-Instruct/results_2025-03-18T17-30-30.453980.json +98 -0
- evals/Qwen2.5-1.5B-Instruct/results/Qwen/Qwen2.5-1.5B-Instruct/results_2025-03-19T19-51-58.260659.json +98 -0
- evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T16-12-57.684927.json +98 -0
- evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T16-54-25.306795.json +98 -0
- evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T17-09-45.381308.json +98 -0
- evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T17-15-43.577739.json +98 -0
- evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T22-45-57.990877.json +98 -0
- evals/Qwen2.5-1.5B-Open-R1-Distill-0318/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0318/results_2025-03-19T10-18-52.986335.json +98 -0
- evals/Qwen2.5-1.5B-Open-R1-Distill/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill/results_2025-03-17T10-59-16.909691.json +98 -0
- evals/Qwen2.5-7B-Instruct/results/Qwen/Qwen2.5-7B-Instruct/results_2025-03-19T20-10-18.884787.json +98 -0
- evals/Qwen2.5-7B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317/results_2025-03-18T16-35-47.509906.json +98 -0
- evals/Qwen2.5-7B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317/results_2025-03-19T16-22-09.954754.json +98 -0
- evals/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results_2025-03-17T10-41-38.230702.json +98 -0
evals/DeepSeek-R1-Distill-Qwen-1.5B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results_2025-03-19T18-06-22.517116.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1987404.563364426,
|
| 9 |
+
"end_time": 1988203.22582261,
|
| 10 |
+
"total_evaluation_time_secondes": "798.6624581841752",
|
| 11 |
+
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|math_500|0": {
|
| 18 |
+
"extractive_match": 0.866,
|
| 19 |
+
"extractive_match_stderr": 0.015249692003854488
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.866,
|
| 23 |
+
"extractive_match_stderr": 0.015249692003854488
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|math_500|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|math_500": {
|
| 31 |
+
"name": "math_500",
|
| 32 |
+
"prompt_function": "math_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/MATH-500",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"test"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"test"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 500,
|
| 64 |
+
"effective_num_docs": 500,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|math_500|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "adf0cc8311011db2",
|
| 73 |
+
"hash_full_prompts": "63c902dbdbaf1552",
|
| 74 |
+
"hash_input_tokens": "2af397a095a31139",
|
| 75 |
+
"hash_cont_tokens": "bcb223eaec255944"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 500,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 500,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "bfaad1993ff37a60",
|
| 88 |
+
"hash_full_prompts": "3ceaaade5cf43911",
|
| 89 |
+
"hash_input_tokens": "c663dbac8a64d3e4",
|
| 90 |
+
"hash_cont_tokens": "18b7de9c0f3de706"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 500,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 500,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/DeepSeek-R1-Distill-Qwen-7B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/results_2025-03-18T17-53-50.666224.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1899825.923879466,
|
| 9 |
+
"end_time": 1901051.417550421,
|
| 10 |
+
"total_evaluation_time_secondes": "1225.4936709550675",
|
| 11 |
+
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.5333333333333333,
|
| 19 |
+
"extractive_match_stderr": 0.09264111117062017
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.5333333333333333,
|
| 23 |
+
"extractive_match_stderr": 0.09264111117062017
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "253167becf0dfed7",
|
| 74 |
+
"hash_input_tokens": "bf1cc75b5f12dfb8",
|
| 75 |
+
"hash_cont_tokens": "3ac3e0e58dc8d1f5"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "84ff409b6bbf7cc0",
|
| 89 |
+
"hash_input_tokens": "9a8c7e54ce09af84",
|
| 90 |
+
"hash_cont_tokens": "0c7bfaa06d00eaa4"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/DeepSeek-R1-Distill-Qwen-7B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/results_2025-03-19T18-45-29.108131.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1989411.904871259,
|
| 9 |
+
"end_time": 1990549.836358162,
|
| 10 |
+
"total_evaluation_time_secondes": "1137.9314869032241",
|
| 11 |
+
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|math_500|0": {
|
| 18 |
+
"extractive_match": 0.934,
|
| 19 |
+
"extractive_match_stderr": 0.011114633153652964
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.934,
|
| 23 |
+
"extractive_match_stderr": 0.011114633153652964
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|math_500|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|math_500": {
|
| 31 |
+
"name": "math_500",
|
| 32 |
+
"prompt_function": "math_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/MATH-500",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"test"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"test"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 500,
|
| 64 |
+
"effective_num_docs": 500,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|math_500|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "adf0cc8311011db2",
|
| 73 |
+
"hash_full_prompts": "63c902dbdbaf1552",
|
| 74 |
+
"hash_input_tokens": "2af397a095a31139",
|
| 75 |
+
"hash_cont_tokens": "cac4733bad35e8e8"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 500,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 500,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "bfaad1993ff37a60",
|
| 88 |
+
"hash_full_prompts": "3ceaaade5cf43911",
|
| 89 |
+
"hash_input_tokens": "c663dbac8a64d3e4",
|
| 90 |
+
"hash_cont_tokens": "ca1270e8aea98798"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 500,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 500,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Instruct/results/Qwen/Qwen2.5-1.5B-Instruct/results_2025-03-18T17-30-30.453980.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1898983.620888934,
|
| 9 |
+
"end_time": 1899651.212244441,
|
| 10 |
+
"total_evaluation_time_secondes": "667.5913555070292",
|
| 11 |
+
"model_name": "Qwen/Qwen2.5-1.5B-Instruct",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.0,
|
| 19 |
+
"extractive_match_stderr": 0.0
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.0,
|
| 23 |
+
"extractive_match_stderr": 0.0
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "d1829811f23cf34b",
|
| 74 |
+
"hash_input_tokens": "7211f832bf7f8d79",
|
| 75 |
+
"hash_cont_tokens": "cb72ff864358b2c0"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "09fe8694776a7143",
|
| 89 |
+
"hash_input_tokens": "b52bc353fe82900e",
|
| 90 |
+
"hash_cont_tokens": "cf44ad0095a7289d"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Instruct/results/Qwen/Qwen2.5-1.5B-Instruct/results_2025-03-19T19-51-58.260659.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1993820.241350481,
|
| 9 |
+
"end_time": 1994539.00358457,
|
| 10 |
+
"total_evaluation_time_secondes": "718.7622340889648",
|
| 11 |
+
"model_name": "Qwen/Qwen2.5-1.5B-Instruct",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|math_500|0": {
|
| 18 |
+
"extractive_match": 0.554,
|
| 19 |
+
"extractive_match_stderr": 0.022252153078595897
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.554,
|
| 23 |
+
"extractive_match_stderr": 0.022252153078595897
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|math_500|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|math_500": {
|
| 31 |
+
"name": "math_500",
|
| 32 |
+
"prompt_function": "math_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/MATH-500",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"test"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"test"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 500,
|
| 64 |
+
"effective_num_docs": 500,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|math_500|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "adf0cc8311011db2",
|
| 73 |
+
"hash_full_prompts": "8ea39bd2d4645692",
|
| 74 |
+
"hash_input_tokens": "b50dbed21f398c5a",
|
| 75 |
+
"hash_cont_tokens": "aab8acf9493cc551"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 500,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 500,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "bfaad1993ff37a60",
|
| 88 |
+
"hash_full_prompts": "bafb225051f36263",
|
| 89 |
+
"hash_input_tokens": "4eb9b54e733b7bfd",
|
| 90 |
+
"hash_cont_tokens": "2473c03a869943a2"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 500,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 500,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T16-12-57.684927.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1894144.109569344,
|
| 9 |
+
"end_time": 1894998.424300142,
|
| 10 |
+
"total_evaluation_time_secondes": "854.3147307981271",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.0,
|
| 19 |
+
"extractive_match_stderr": 0.0
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.0,
|
| 23 |
+
"extractive_match_stderr": 0.0
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "d1829811f23cf34b",
|
| 74 |
+
"hash_input_tokens": "7211f832bf7f8d79",
|
| 75 |
+
"hash_cont_tokens": "502b688747b94043"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "09fe8694776a7143",
|
| 89 |
+
"hash_input_tokens": "b52bc353fe82900e",
|
| 90 |
+
"hash_cont_tokens": "fd9582dd0a52a368"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T16-54-25.306795.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1896943.276604601,
|
| 9 |
+
"end_time": 1897486.051500043,
|
| 10 |
+
"total_evaluation_time_secondes": "542.7748954419512",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.0,
|
| 19 |
+
"extractive_match_stderr": 0.0
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.0,
|
| 23 |
+
"extractive_match_stderr": 0.0
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "d1829811f23cf34b",
|
| 74 |
+
"hash_input_tokens": "7211f832bf7f8d79",
|
| 75 |
+
"hash_cont_tokens": "b7d0decfbb6478c5"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "09fe8694776a7143",
|
| 89 |
+
"hash_input_tokens": "b52bc353fe82900e",
|
| 90 |
+
"hash_cont_tokens": "279f0b9068e707a9"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T17-09-45.381308.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1898156.252841867,
|
| 9 |
+
"end_time": 1898406.136190798,
|
| 10 |
+
"total_evaluation_time_secondes": "249.88334893085994",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.0,
|
| 19 |
+
"extractive_match_stderr": 0.0
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.0,
|
| 23 |
+
"extractive_match_stderr": 0.0
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "d1829811f23cf34b",
|
| 74 |
+
"hash_input_tokens": "7211f832bf7f8d79",
|
| 75 |
+
"hash_cont_tokens": "e75f5cf9b76452e2"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "09fe8694776a7143",
|
| 89 |
+
"hash_input_tokens": "b52bc353fe82900e",
|
| 90 |
+
"hash_cont_tokens": "40ea583099edd587"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T17-15-43.577739.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1898679.514228297,
|
| 9 |
+
"end_time": 1898764.340840129,
|
| 10 |
+
"total_evaluation_time_secondes": "84.82661183201708",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.0,
|
| 19 |
+
"extractive_match_stderr": 0.0
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.0,
|
| 23 |
+
"extractive_match_stderr": 0.0
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "d1829811f23cf34b",
|
| 74 |
+
"hash_input_tokens": "7211f832bf7f8d79",
|
| 75 |
+
"hash_cont_tokens": "72855190a84189eb"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "09fe8694776a7143",
|
| 89 |
+
"hash_input_tokens": "b52bc353fe82900e",
|
| 90 |
+
"hash_cont_tokens": "13f647a3d53749da"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317/results_2025-03-18T22-45-57.990877.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1917493.299303261,
|
| 9 |
+
"end_time": 1918578.567845826,
|
| 10 |
+
"total_evaluation_time_secondes": "1085.2685425649397",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0317",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|math_500|0": {
|
| 18 |
+
"extractive_match": 0.422,
|
| 19 |
+
"extractive_match_stderr": 0.022109039310618552
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.422,
|
| 23 |
+
"extractive_match_stderr": 0.022109039310618552
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|math_500|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|math_500": {
|
| 31 |
+
"name": "math_500",
|
| 32 |
+
"prompt_function": "math_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/MATH-500",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"test"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"test"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 500,
|
| 64 |
+
"effective_num_docs": 500,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|math_500|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "adf0cc8311011db2",
|
| 73 |
+
"hash_full_prompts": "8ea39bd2d4645692",
|
| 74 |
+
"hash_input_tokens": "b50dbed21f398c5a",
|
| 75 |
+
"hash_cont_tokens": "0789ce17f6800d1e"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 500,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 500,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "bfaad1993ff37a60",
|
| 88 |
+
"hash_full_prompts": "bafb225051f36263",
|
| 89 |
+
"hash_input_tokens": "4eb9b54e733b7bfd",
|
| 90 |
+
"hash_cont_tokens": "a00be3bbf7712a23"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 500,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 500,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Open-R1-Distill-0318/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0318/results_2025-03-19T10-18-52.986335.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1956474.007134297,
|
| 9 |
+
"end_time": 1960153.23434271,
|
| 10 |
+
"total_evaluation_time_secondes": "3679.2272084131837",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill-0318",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|math_500|0": {
|
| 18 |
+
"extractive_match": 0.506,
|
| 19 |
+
"extractive_match_stderr": 0.022381462412439324
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.506,
|
| 23 |
+
"extractive_match_stderr": 0.022381462412439324
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|math_500|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|math_500": {
|
| 31 |
+
"name": "math_500",
|
| 32 |
+
"prompt_function": "math_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/MATH-500",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"test"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"test"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 500,
|
| 64 |
+
"effective_num_docs": 500,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|math_500|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "adf0cc8311011db2",
|
| 73 |
+
"hash_full_prompts": "8ea39bd2d4645692",
|
| 74 |
+
"hash_input_tokens": "b50dbed21f398c5a",
|
| 75 |
+
"hash_cont_tokens": "c7da8253b1e8072e"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 500,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 500,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "bfaad1993ff37a60",
|
| 88 |
+
"hash_full_prompts": "bafb225051f36263",
|
| 89 |
+
"hash_input_tokens": "4eb9b54e733b7bfd",
|
| 90 |
+
"hash_cont_tokens": "cc2b459f237a7b5b"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 500,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 500,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-1.5B-Open-R1-Distill/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill/results_2025-03-17T10-59-16.909691.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1789034.454502437,
|
| 9 |
+
"end_time": 1789777.674175623,
|
| 10 |
+
"total_evaluation_time_secondes": "743.2196731860749",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-1.5B-Open-R1-Distill",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.0,
|
| 19 |
+
"extractive_match_stderr": 0.0
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.0,
|
| 23 |
+
"extractive_match_stderr": 0.0
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "d1829811f23cf34b",
|
| 74 |
+
"hash_input_tokens": "7211f832bf7f8d79",
|
| 75 |
+
"hash_cont_tokens": "098eb358c8c67ace"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "09fe8694776a7143",
|
| 89 |
+
"hash_input_tokens": "b52bc353fe82900e",
|
| 90 |
+
"hash_cont_tokens": "80fb70da799d7afb"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-7B-Instruct/results/Qwen/Qwen2.5-7B-Instruct/results_2025-03-19T20-10-18.884787.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1995311.985713479,
|
| 9 |
+
"end_time": 1995639.643426115,
|
| 10 |
+
"total_evaluation_time_secondes": "327.657712635817",
|
| 11 |
+
"model_name": "Qwen/Qwen2.5-7B-Instruct",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|math_500|0": {
|
| 18 |
+
"extractive_match": 0.756,
|
| 19 |
+
"extractive_match_stderr": 0.01922673489361458
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.756,
|
| 23 |
+
"extractive_match_stderr": 0.01922673489361458
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|math_500|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|math_500": {
|
| 31 |
+
"name": "math_500",
|
| 32 |
+
"prompt_function": "math_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/MATH-500",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"test"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"test"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 500,
|
| 64 |
+
"effective_num_docs": 500,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|math_500|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "adf0cc8311011db2",
|
| 73 |
+
"hash_full_prompts": "8ea39bd2d4645692",
|
| 74 |
+
"hash_input_tokens": "b50dbed21f398c5a",
|
| 75 |
+
"hash_cont_tokens": "36dcf14f9584caec"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 500,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 500,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "bfaad1993ff37a60",
|
| 88 |
+
"hash_full_prompts": "bafb225051f36263",
|
| 89 |
+
"hash_input_tokens": "4eb9b54e733b7bfd",
|
| 90 |
+
"hash_cont_tokens": "7c39254cae6da7e1"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 500,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 500,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-7B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317/results_2025-03-18T16-35-47.509906.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1895055.944744457,
|
| 9 |
+
"end_time": 1896368.24957376,
|
| 10 |
+
"total_evaluation_time_secondes": "1312.3048293029424",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.1,
|
| 19 |
+
"extractive_match_stderr": 0.055708601453115555
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.1,
|
| 23 |
+
"extractive_match_stderr": 0.055708601453115555
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "d1829811f23cf34b",
|
| 74 |
+
"hash_input_tokens": "7211f832bf7f8d79",
|
| 75 |
+
"hash_cont_tokens": "c0efce6be7426f22"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "09fe8694776a7143",
|
| 89 |
+
"hash_input_tokens": "b52bc353fe82900e",
|
| 90 |
+
"hash_cont_tokens": "e993bfe4f585739c"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/Qwen2.5-7B-Open-R1-Distill-0317/results/_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317/results_2025-03-19T16-22-09.954754.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1971587.064608292,
|
| 9 |
+
"end_time": 1981950.716929167,
|
| 10 |
+
"total_evaluation_time_secondes": "10363.652320875088",
|
| 11 |
+
"model_name": "_mnt_openr1_data_disk_OpenR1_PT_open-r1_data_Qwen2.5-7B-Open-R1-Distill-0317",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|math_500|0": {
|
| 18 |
+
"extractive_match": 0.748,
|
| 19 |
+
"extractive_match_stderr": 0.01943572728224954
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.748,
|
| 23 |
+
"extractive_match_stderr": 0.01943572728224954
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|math_500|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|math_500": {
|
| 31 |
+
"name": "math_500",
|
| 32 |
+
"prompt_function": "math_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/MATH-500",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"test"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"test"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 500,
|
| 64 |
+
"effective_num_docs": 500,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|math_500|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "adf0cc8311011db2",
|
| 73 |
+
"hash_full_prompts": "8ea39bd2d4645692",
|
| 74 |
+
"hash_input_tokens": "b50dbed21f398c5a",
|
| 75 |
+
"hash_cont_tokens": "224a24b9c50234b7"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 500,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 500,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "bfaad1993ff37a60",
|
| 88 |
+
"hash_full_prompts": "bafb225051f36263",
|
| 89 |
+
"hash_input_tokens": "4eb9b54e733b7bfd",
|
| 90 |
+
"hash_cont_tokens": "94cd7676981f6c33"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 500,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 500,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|
evals/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/results_2025-03-17T10-41-38.230702.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "?",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"override_batch_size": -1,
|
| 6 |
+
"max_samples": null,
|
| 7 |
+
"job_id": 0,
|
| 8 |
+
"start_time": 1788110.135742885,
|
| 9 |
+
"end_time": 1788718.983399756,
|
| 10 |
+
"total_evaluation_time_secondes": "608.8476568709593",
|
| 11 |
+
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
| 12 |
+
"model_sha": "",
|
| 13 |
+
"model_dtype": null,
|
| 14 |
+
"model_size": null
|
| 15 |
+
},
|
| 16 |
+
"results": {
|
| 17 |
+
"custom|aime24|0": {
|
| 18 |
+
"extractive_match": 0.3,
|
| 19 |
+
"extractive_match_stderr": 0.0850962943396763
|
| 20 |
+
},
|
| 21 |
+
"all": {
|
| 22 |
+
"extractive_match": 0.3,
|
| 23 |
+
"extractive_match_stderr": 0.0850962943396763
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"versions": {
|
| 27 |
+
"custom|aime24|0": 1
|
| 28 |
+
},
|
| 29 |
+
"config_tasks": {
|
| 30 |
+
"custom|aime24": {
|
| 31 |
+
"name": "aime24",
|
| 32 |
+
"prompt_function": "aime_prompt_fn",
|
| 33 |
+
"hf_repo": "HuggingFaceH4/aime_2024",
|
| 34 |
+
"hf_subset": "default",
|
| 35 |
+
"metric": [
|
| 36 |
+
{
|
| 37 |
+
"metric_name": "extractive_match",
|
| 38 |
+
"higher_is_better": true,
|
| 39 |
+
"category": "3",
|
| 40 |
+
"use_case": "1",
|
| 41 |
+
"sample_level_fn": "sample_level_fn",
|
| 42 |
+
"corpus_level_fn": "mean"
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"hf_revision": null,
|
| 46 |
+
"hf_filter": null,
|
| 47 |
+
"hf_avail_splits": [
|
| 48 |
+
"train"
|
| 49 |
+
],
|
| 50 |
+
"trust_dataset": false,
|
| 51 |
+
"evaluation_splits": [
|
| 52 |
+
"train"
|
| 53 |
+
],
|
| 54 |
+
"few_shots_split": null,
|
| 55 |
+
"few_shots_select": null,
|
| 56 |
+
"generation_size": 32768,
|
| 57 |
+
"generation_grammar": null,
|
| 58 |
+
"stop_sequence": [],
|
| 59 |
+
"num_samples": null,
|
| 60 |
+
"suite": [
|
| 61 |
+
"custom"
|
| 62 |
+
],
|
| 63 |
+
"original_num_docs": 30,
|
| 64 |
+
"effective_num_docs": 30,
|
| 65 |
+
"must_remove_duplicate_docs": false,
|
| 66 |
+
"version": 1
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"summary_tasks": {
|
| 70 |
+
"custom|aime24|0": {
|
| 71 |
+
"hashes": {
|
| 72 |
+
"hash_examples": "ddec8fc79d0a014b",
|
| 73 |
+
"hash_full_prompts": "253167becf0dfed7",
|
| 74 |
+
"hash_input_tokens": "bf1cc75b5f12dfb8",
|
| 75 |
+
"hash_cont_tokens": "e14c52e3f66b52bc"
|
| 76 |
+
},
|
| 77 |
+
"truncated": 0,
|
| 78 |
+
"non_truncated": 30,
|
| 79 |
+
"padded": 0,
|
| 80 |
+
"non_padded": 30,
|
| 81 |
+
"effective_few_shots": 0.0,
|
| 82 |
+
"num_truncated_few_shots": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_general": {
|
| 86 |
+
"hashes": {
|
| 87 |
+
"hash_examples": "c903e836a519cf98",
|
| 88 |
+
"hash_full_prompts": "84ff409b6bbf7cc0",
|
| 89 |
+
"hash_input_tokens": "9a8c7e54ce09af84",
|
| 90 |
+
"hash_cont_tokens": "4710145804b70924"
|
| 91 |
+
},
|
| 92 |
+
"truncated": 0,
|
| 93 |
+
"non_truncated": 30,
|
| 94 |
+
"padded": 0,
|
| 95 |
+
"non_padded": 30,
|
| 96 |
+
"num_truncated_few_shots": 0
|
| 97 |
+
}
|
| 98 |
+
}
|