Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- eval_results/global_step_30/mmlu_stem/test_abel_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_35/mmlu_stem/test_abel_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_5/gsm8k/test_abel_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_5/minerva_math/test_abel_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_5/olympiadbench/test_abel_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/eval_results.csv +9 -0
- eval_results_avg16/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +17 -0
- eval_results_avg16/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +17 -0
- eval_results_avg16/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +17 -0
- eval_results_avg16/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +17 -0
- eval_results_avg16/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +17 -0
- eval_results_avg16/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +17 -0
- eval_results_avg16/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +17 -0
- eval_results_avg16/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg16/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +17 -0
- eval_results_avg16/plots/eval_results_avg16_acc_keywords.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_acc_pass_acc.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_acc_tokens.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_avg_stop_tokens.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_box_ratio_and_token_length.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_clip_ratio.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_correct_tokens.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_repeat_ratio_and_token_length.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_tokens_keywords.png +0 -0
- eval_results_avg16/plots/eval_results_avg16_wrong_tokens.png +0 -0
- eval_results_avg32/eval_results.csv +9 -0
- eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_15/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_15/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
eval_results/global_step_30/mmlu_stem/test_abel_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_35/mmlu_stem/test_abel_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_5/gsm8k/test_abel_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_5/minerva_math/test_abel_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_5/olympiadbench/test_abel_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/eval_results.csv
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
|
| 2 |
+
eval_results_avg16-global_step_0,7.0,37.5,1237.0,0.25,188.0,1263.8974358974358,0.025,858.5384615384615,0.975,0.825,0.6,7.0,37.5,1237.0,0.25,188.0,1263.8974358974358,0.025,858.5384615384615,0.975,0.825,0.6
|
| 3 |
+
eval_results_avg16-global_step_5,5.8,40.0,1090.675,0.15,562.0,1104.2307692307693,0.025,708.3589743589744,0.975,0.875,0.65,5.8,40.0,1090.675,0.15,562.0,1104.2307692307693,0.025,708.3589743589744,0.975,0.875,0.65
|
| 4 |
+
eval_results_avg16-global_step_10,7.2,52.5,757.525,0.225,581.6666666666666,771.7837837837837,0.0,757.525,1.0,0.875,0.55,7.2,52.5,757.525,0.225,581.6666666666666,771.7837837837837,0.0,757.525,1.0,0.875,0.55
|
| 5 |
+
eval_results_avg16-global_step_15,9.8,55.0,692.975,0.225,708.25,691.2777777777778,0.0,692.975,1.0,0.925,0.35,9.8,55.0,692.975,0.225,708.25,691.2777777777778,0.0,692.975,1.0,0.925,0.35
|
| 6 |
+
eval_results_avg16-global_step_20,11.1,55.0,760.4,0.375,660.3333333333334,768.5135135135135,0.0,760.4,1.0,0.975,0.35,11.1,55.0,760.4,0.375,660.3333333333334,768.5135135135135,0.0,760.4,1.0,0.975,0.35
|
| 7 |
+
eval_results_avg16-global_step_25,10.9,52.5,1488.45,0.25,741.0,1549.054054054054,0.05,726.8684210526316,0.95,0.95,0.35,10.9,52.5,1488.45,0.25,741.0,1549.054054054054,0.05,726.8684210526316,0.95,0.95,0.35
|
| 8 |
+
eval_results_avg16-global_step_30,11.6,52.5,800.5,0.3,720.0,807.027027027027,0.0,800.5,1.0,1.0,0.475,11.6,52.5,800.5,0.3,720.0,807.027027027027,0.0,800.5,1.0,1.0,0.475
|
| 9 |
+
eval_results_avg16-global_step_35,12.0,60.0,704.325,0.225,534.25,723.2222222222222,0.0,704.325,1.0,1.0,0.375,12.0,60.0,704.325,0.225,534.25,723.2222222222222,0.0,704.325,1.0,1.0,0.375
|
eval_results_avg16/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 640,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 7.0,
|
| 7 |
+
"pass_acc": 37.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 7.0,
|
| 10 |
+
"2": 12.2,
|
| 11 |
+
"4": 19.6,
|
| 12 |
+
"8": 29.0,
|
| 13 |
+
"16": 37.5
|
| 14 |
+
},
|
| 15 |
+
"time_use_in_second": 168.278502702713,
|
| 16 |
+
"time_use_in_minite": "2:48"
|
| 17 |
+
}
|
eval_results_avg16/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 640,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 7.2,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 7.2,
|
| 10 |
+
"2": 12.7,
|
| 11 |
+
"4": 20.9,
|
| 12 |
+
"8": 32.9,
|
| 13 |
+
"16": 52.5
|
| 14 |
+
},
|
| 15 |
+
"time_use_in_second": 148.89572286605835,
|
| 16 |
+
"time_use_in_minite": "2:28"
|
| 17 |
+
}
|
eval_results_avg16/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 640,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 9.8,
|
| 7 |
+
"pass_acc": 55.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 9.8,
|
| 10 |
+
"2": 16.4,
|
| 11 |
+
"4": 25.3,
|
| 12 |
+
"8": 37.5,
|
| 13 |
+
"16": 55.0
|
| 14 |
+
},
|
| 15 |
+
"time_use_in_second": 151.3860239982605,
|
| 16 |
+
"time_use_in_minite": "2:31"
|
| 17 |
+
}
|
eval_results_avg16/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 640,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 11.1,
|
| 7 |
+
"pass_acc": 55.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 11.1,
|
| 10 |
+
"2": 18.3,
|
| 11 |
+
"4": 27.7,
|
| 12 |
+
"8": 40.2,
|
| 13 |
+
"16": 55.0
|
| 14 |
+
},
|
| 15 |
+
"time_use_in_second": 135.96930646896362,
|
| 16 |
+
"time_use_in_minite": "2:15"
|
| 17 |
+
}
|
eval_results_avg16/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 640,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 10.9,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 10.9,
|
| 10 |
+
"2": 17.1,
|
| 11 |
+
"4": 25.5,
|
| 12 |
+
"8": 36.9,
|
| 13 |
+
"16": 52.5
|
| 14 |
+
},
|
| 15 |
+
"time_use_in_second": 152.54610538482666,
|
| 16 |
+
"time_use_in_minite": "2:32"
|
| 17 |
+
}
|
eval_results_avg16/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 640,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 11.6,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 11.6,
|
| 10 |
+
"2": 18.3,
|
| 11 |
+
"4": 27.9,
|
| 12 |
+
"8": 39.9,
|
| 13 |
+
"16": 52.5
|
| 14 |
+
},
|
| 15 |
+
"time_use_in_second": 144.6851499080658,
|
| 16 |
+
"time_use_in_minite": "2:24"
|
| 17 |
+
}
|
eval_results_avg16/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 640,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 12.0,
|
| 7 |
+
"pass_acc": 60.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 12.0,
|
| 10 |
+
"2": 19.0,
|
| 11 |
+
"4": 28.0,
|
| 12 |
+
"8": 41.0,
|
| 13 |
+
"16": 60.0
|
| 14 |
+
},
|
| 15 |
+
"time_use_in_second": 135.14728951454163,
|
| 16 |
+
"time_use_in_minite": "2:15"
|
| 17 |
+
}
|
eval_results_avg16/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg16/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 640,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 5.8,
|
| 7 |
+
"pass_acc": 40.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 5.8,
|
| 10 |
+
"2": 10.3,
|
| 11 |
+
"4": 17.2,
|
| 12 |
+
"8": 27.0,
|
| 13 |
+
"16": 40.0
|
| 14 |
+
},
|
| 15 |
+
"time_use_in_second": 163.98140692710876,
|
| 16 |
+
"time_use_in_minite": "2:43"
|
| 17 |
+
}
|
eval_results_avg16/plots/eval_results_avg16_acc_keywords.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_acc_pass_acc.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_acc_tokens.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_avg_stop_tokens.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_box_ratio_and_token_length.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_clip_ratio.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_correct_tokens.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_repeat_ratio_and_token_length.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_tokens_keywords.png
ADDED
|
eval_results_avg16/plots/eval_results_avg16_wrong_tokens.png
ADDED
|
eval_results_avg32/eval_results.csv
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
|
| 2 |
+
eval_results_avg32-global_step_0,0.6,13.3,1244.9,0.3333333333333333,0.0,1244.9,0.0,1244.9,1.0,0.8666666666666667,0.5333333333333333,0.2,6.7,927.2,0.36666666666666664,0,927.2,0.0,927.2,1.0,0.9,0.7333333333333333,0.4,10.0,1086.0500000000002,0.35,0.0,1086.0500000000002,0.0,1086.0500000000002,1.0,0.8833333333333333,0.6333333333333333
|
| 3 |
+
eval_results_avg32-global_step_5,1.1,16.7,1183.2,0.4,0.0,1183.2,0.0,1183.2,1.0,0.8,0.5,0.2,6.7,1513.0333333333333,0.4666666666666667,0,1513.0333333333333,0.03333333333333333,1016.0689655172414,0.9666666666666667,0.9333333333333333,0.5333333333333333,0.65,11.7,1348.1166666666668,0.43333333333333335,0.0,1348.1166666666668,0.016666666666666666,1099.6344827586208,0.9833333333333334,0.8666666666666667,0.5166666666666666
|
| 4 |
+
eval_results_avg32-global_step_10,1.1,10.0,1363.0,0.23333333333333334,0.0,1363.0,0.03333333333333333,859.4827586206897,0.9666666666666667,0.7666666666666667,0.6,0.2,6.7,1112.6,0.16666666666666666,0,1112.6,0.0,1112.6,1.0,1.0,0.5,0.65,8.35,1237.8,0.2,0.0,1237.8,0.016666666666666666,986.0413793103448,0.9833333333333334,0.8833333333333333,0.55
|
| 5 |
+
eval_results_avg32-global_step_15,0.9,10.0,866.8333333333334,0.2,0.0,866.8333333333334,0.0,866.8333333333334,1.0,0.9,0.5,0.1,3.3,999.2333333333333,0.4,0,999.2333333333333,0.0,999.2333333333333,1.0,0.9666666666666667,0.5666666666666667,0.5,6.65,933.0333333333333,0.30000000000000004,0.0,933.0333333333333,0.0,933.0333333333333,1.0,0.9333333333333333,0.5333333333333333
|
| 6 |
+
eval_results_avg32-global_step_20,1.7,16.7,935.1666666666666,0.36666666666666664,435.0,952.4137931034483,0.0,935.1666666666666,1.0,1.0,0.4666666666666667,0.0,0.0,792.9666666666667,0.43333333333333335,0,792.9666666666667,0.0,792.9666666666667,1.0,1.0,0.36666666666666664,0.85,8.35,864.0666666666666,0.4,217.5,872.6902298850575,0.0,864.0666666666666,1.0,1.0,0.41666666666666663
|
| 7 |
+
eval_results_avg32-global_step_25,2.1,20.0,757.4333333333333,0.3,751.0,757.6551724137931,0.0,757.4333333333333,1.0,1.0,0.4666666666666667,0.1,3.3,818.6666666666666,0.36666666666666664,0,818.6666666666666,0.0,818.6666666666666,1.0,1.0,0.5,1.1,11.65,788.05,0.3333333333333333,375.5,788.1609195402299,0.0,788.05,1.0,1.0,0.48333333333333334
|
| 8 |
+
eval_results_avg32-global_step_30,2.0,10.0,1289.6333333333334,0.36666666666666664,585.0,1313.9310344827586,0.0,1289.6333333333334,1.0,1.0,0.43333333333333335,0.1,3.3,1110.5,0.23333333333333334,0,1110.5,0.0,1110.5,1.0,1.0,0.43333333333333335,1.05,6.65,1200.0666666666666,0.3,292.5,1212.2155172413793,0.0,1200.0666666666666,1.0,1.0,0.43333333333333335
|
| 9 |
+
eval_results_avg32-global_step_35,2.6,16.7,823.9333333333333,0.2,632.5,837.6071428571429,0.0,823.9333333333333,1.0,1.0,0.4,0.2,6.7,785.9,0.16666666666666666,0,785.9,0.0,785.9,1.0,1.0,0.3,1.4000000000000001,11.7,804.9166666666666,0.18333333333333335,316.25,811.7535714285714,0.0,804.9166666666666,1.0,1.0,0.35
|
eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 0.6,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.6,
|
| 10 |
+
"2": 1.2,
|
| 11 |
+
"4": 2.4,
|
| 12 |
+
"8": 4.6,
|
| 13 |
+
"16": 8.4,
|
| 14 |
+
"32": 13.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 245.02268433570862,
|
| 17 |
+
"time_use_in_minite": "4:05"
|
| 18 |
+
}
|
eval_results_avg32/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.2,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.2,
|
| 10 |
+
"2": 0.4,
|
| 11 |
+
"4": 0.8,
|
| 12 |
+
"8": 1.7,
|
| 13 |
+
"16": 3.3,
|
| 14 |
+
"32": 6.7
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 300.4465308189392,
|
| 17 |
+
"time_use_in_minite": "5:00"
|
| 18 |
+
}
|
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 1.1,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 1.1,
|
| 10 |
+
"2": 2.2,
|
| 11 |
+
"4": 3.9,
|
| 12 |
+
"8": 6.5,
|
| 13 |
+
"16": 9.0,
|
| 14 |
+
"32": 10.0
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 277.7810573577881,
|
| 17 |
+
"time_use_in_minite": "4:37"
|
| 18 |
+
}
|
eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.2,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.2,
|
| 10 |
+
"2": 0.4,
|
| 11 |
+
"4": 0.8,
|
| 12 |
+
"8": 1.7,
|
| 13 |
+
"16": 3.3,
|
| 14 |
+
"32": 6.7
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 234.06970882415771,
|
| 17 |
+
"time_use_in_minite": "3:54"
|
| 18 |
+
}
|
eval_results_avg32/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.9,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.9,
|
| 10 |
+
"2": 1.8,
|
| 11 |
+
"4": 3.3,
|
| 12 |
+
"8": 5.5,
|
| 13 |
+
"16": 8.0,
|
| 14 |
+
"32": 10.0
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 212.3564326763153,
|
| 17 |
+
"time_use_in_minite": "3:32"
|
| 18 |
+
}
|
eval_results_avg32/global_step_15/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_15/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.1,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.1,
|
| 10 |
+
"2": 0.2,
|
| 11 |
+
"4": 0.4,
|
| 12 |
+
"8": 0.8,
|
| 13 |
+
"16": 1.7,
|
| 14 |
+
"32": 3.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 242.44278526306152,
|
| 17 |
+
"time_use_in_minite": "4:02"
|
| 18 |
+
}
|
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 1.7,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 1.7,
|
| 10 |
+
"2": 3.1,
|
| 11 |
+
"4": 5.4,
|
| 12 |
+
"8": 8.6,
|
| 13 |
+
"16": 12.4,
|
| 14 |
+
"32": 16.7
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 183.82105946540833,
|
| 17 |
+
"time_use_in_minite": "3:03"
|
| 18 |
+
}
|
eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.0,
|
| 7 |
+
"pass_acc": 0.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.0,
|
| 10 |
+
"2": 0.0,
|
| 11 |
+
"4": 0.0,
|
| 12 |
+
"8": 0.0,
|
| 13 |
+
"16": 0.0,
|
| 14 |
+
"32": 0.0
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 214.73410415649414,
|
| 17 |
+
"time_use_in_minite": "3:34"
|
| 18 |
+
}
|
eval_results_avg32/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|