Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg4/eval_results.csv +12 -0
- eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 4,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 4.6,
|
| 7 |
+
"pass_acc": 33.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.6,
|
| 10 |
+
"2": 7.7,
|
| 11 |
+
"4": 12.0,
|
| 12 |
+
"8": 18.2,
|
| 13 |
+
"16": 25.9,
|
| 14 |
+
"32": 33.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 515.7602069377899,
|
| 17 |
+
"time_use_in_minite": "8:35"
|
| 18 |
+
}
|
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 30.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7,
|
| 10 |
+
"2": 9.8,
|
| 11 |
+
"4": 14.1,
|
| 12 |
+
"8": 19.9,
|
| 13 |
+
"16": 26.2,
|
| 14 |
+
"32": 30.0
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 671.4177906513214,
|
| 17 |
+
"time_use_in_minite": "11:11"
|
| 18 |
+
}
|
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 4,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.9,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.9,
|
| 10 |
+
"2": 6.4,
|
| 11 |
+
"4": 9.7,
|
| 12 |
+
"8": 14.1,
|
| 13 |
+
"16": 19.9,
|
| 14 |
+
"32": 26.7
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 513.6733276844025,
|
| 17 |
+
"time_use_in_minite": "8:33"
|
| 18 |
+
}
|
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 36.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7,
|
| 10 |
+
"2": 10.3,
|
| 11 |
+
"4": 14.7,
|
| 12 |
+
"8": 20.5,
|
| 13 |
+
"16": 28.0,
|
| 14 |
+
"32": 36.7
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 720.7702496051788,
|
| 17 |
+
"time_use_in_minite": "12:00"
|
| 18 |
+
}
|
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 4.9,
|
| 7 |
+
"pass_acc": 30.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.9,
|
| 10 |
+
"2": 8.1,
|
| 11 |
+
"4": 12.3,
|
| 12 |
+
"8": 17.5,
|
| 13 |
+
"16": 24.1,
|
| 14 |
+
"32": 30.0
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 463.51481533050537,
|
| 17 |
+
"time_use_in_minite": "7:43"
|
| 18 |
+
}
|
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.2,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.2,
|
| 10 |
+
"2": 9.7,
|
| 11 |
+
"4": 13.8,
|
| 12 |
+
"8": 18.6,
|
| 13 |
+
"16": 23.5,
|
| 14 |
+
"32": 26.7
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 645.1512818336487,
|
| 17 |
+
"time_use_in_minite": "10:45"
|
| 18 |
+
}
|
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 4.6,
|
| 7 |
+
"pass_acc": 30.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.6,
|
| 10 |
+
"2": 6.9,
|
| 11 |
+
"4": 9.8,
|
| 12 |
+
"8": 13.9,
|
| 13 |
+
"16": 19.8,
|
| 14 |
+
"32": 30.0
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 432.0119540691376,
|
| 17 |
+
"time_use_in_minite": "7:12"
|
| 18 |
+
}
|
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 7.7,
|
| 7 |
+
"pass_acc": 33.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 7.7,
|
| 10 |
+
"2": 11.7,
|
| 11 |
+
"4": 16.2,
|
| 12 |
+
"8": 21.2,
|
| 13 |
+
"16": 26.7,
|
| 14 |
+
"32": 33.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 590.3402507305145,
|
| 17 |
+
"time_use_in_minite": "9:50"
|
| 18 |
+
}
|
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 2,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 5.2,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 5.2,
|
| 10 |
+
"2": 8.3,
|
| 11 |
+
"4": 11.9,
|
| 12 |
+
"8": 16.1,
|
| 13 |
+
"16": 21.0,
|
| 14 |
+
"32": 26.7
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 473.4737560749054,
|
| 17 |
+
"time_use_in_minite": "7:53"
|
| 18 |
+
}
|
eval_results_avg4/eval_results.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
|
| 2 |
+
eval_results_avg4-global_step_0,5.0,13.3,1426.8666666666666,0.3,611.0,1485.142857142857,0.03333333333333333,924.3448275862069,0.9666666666666667,0.8666666666666667,0.5333333333333333,3.3,10.0,2181.5666666666666,0.9,1284.0,2245.6785714285716,0.06666666666666667,1194.5,0.9333333333333333,0.8,0.7333333333333333,26.9,55.0,1568.425,0.225,777.0,1907.607142857143,0.05,809.0,0.95,0.875,0.725,11.733333333333334,26.099999999999998,1725.6194444444445,0.47500000000000003,890.6666666666666,1879.4761904761906,0.05000000000000001,975.948275862069,0.9499999999999998,0.8472222222222223,0.6638888888888889
|
| 3 |
+
eval_results_avg4-global_step_10,5.8,10.0,1305.3,0.4,786.0,1342.392857142857,0.03333333333333333,798.551724137931,0.9666666666666667,0.8666666666666667,0.8,0.8,3.3,1669.7333333333333,0.43333333333333335,0.0,1669.7333333333333,0.03333333333333333,1177.4137931034484,0.9666666666666667,0.9666666666666667,0.7,27.5,55.0,1846.1,1.4,634.0714285714286,2498.730769230769,0.05,1101.1315789473683,0.95,0.925,0.575,11.366666666666667,22.766666666666666,1607.0444444444445,0.7444444444444445,473.35714285714283,1836.95231990232,0.03888888888888889,1025.6990320629159,0.9611111111111111,0.9194444444444446,0.6916666666666668
|
| 4 |
+
eval_results_avg4-global_step_20,3.3,3.3,3027.3,0.5333333333333333,726.0,3106.655172413793,0.13333333333333333,1035.576923076923,0.8666666666666667,0.8666666666666667,0.8666666666666667,3.3,6.7,2293.4,0.26666666666666666,759.0,2346.310344827586,0.1,770.4814814814815,0.9,0.9,0.7333333333333333,33.1,62.5,1164.075,0.225,692.875,1478.2083333333333,0.0,1164.075,1.0,0.975,0.775,13.233333333333334,24.166666666666668,2161.5916666666667,0.34166666666666673,725.9583333333334,2310.391283524904,0.07777777777777778,990.0444681861348,0.9222222222222222,0.9138888888888889,0.7916666666666666
|
| 5 |
+
eval_results_avg4-global_step_30,4.2,10.0,2811.7,0.43333333333333335,0.0,2811.7,0.13333333333333333,782.7692307692307,0.8666666666666667,0.8666666666666667,0.7666666666666667,0.8,3.3,1411.9333333333334,0.36666666666666664,0.0,1411.9333333333334,0.03333333333333333,908.8965517241379,0.9666666666666667,0.9666666666666667,0.8333333333333334,30.6,57.5,1902.8,0.225,570.7272727272727,2408.0689655172414,0.075,755.7567567567568,0.925,0.925,0.7,11.866666666666667,23.599999999999998,2042.1444444444444,0.34166666666666673,190.24242424242425,2210.5674329501912,0.08055555555555555,815.807513083375,0.9194444444444446,0.9194444444444446,0.7666666666666666
|
| 6 |
+
eval_results_avg4-global_step_40,6.7,10.0,3329.766666666667,0.26666666666666666,699.5,3517.6428571428573,0.13333333333333333,1380.7692307692307,0.8666666666666667,0.8333333333333334,0.8,4.2,10.0,1305.2333333333333,0.5,638.0,1352.892857142857,0.03333333333333333,798.551724137931,0.9666666666666667,0.9666666666666667,0.6666666666666666,37.5,60.0,884.425,0.325,629.3571428571429,1021.7692307692307,0.0,884.425,1.0,0.975,0.575,16.133333333333333,26.666666666666668,1839.8083333333334,0.3638888888888889,655.6190476190476,1964.1016483516485,0.05555555555555555,1021.2486516357206,0.9444444444444445,0.9249999999999999,0.6805555555555557
|
| 7 |
+
eval_results_avg4-global_step_50,7.5,13.3,2812.766666666667,0.5,651.0,2887.310344827586,0.13333333333333333,784.1153846153846,0.8666666666666667,0.8666666666666667,0.7,4.2,10.0,2894.366666666667,0.26666666666666666,0.0,2894.366666666667,0.13333333333333333,878.3076923076923,0.8666666666666667,0.8666666666666667,0.7666666666666667,32.5,50.0,1760.95,0.15,625.7,2139.366666666667,0.05,1011.578947368421,0.95,0.925,0.775,14.733333333333334,24.433333333333334,2489.361111111111,0.3055555555555555,425.56666666666666,2640.3478927203064,0.10555555555555556,891.334008097166,0.8944444444444445,0.8861111111111111,0.7472222222222222
|
| 8 |
+
eval_results_avg4-global_step_60,6.7,10.0,2675.5666666666666,0.3,617.0,2822.6071428571427,0.1,1195.037037037037,0.9,0.8666666666666667,0.8666666666666667,4.2,6.7,1860.5,7.566666666666666,960.0,1891.551724137931,0.06666666666666667,850.5357142857143,0.9333333333333333,0.9333333333333333,0.7666666666666667,37.5,52.5,1172.65,0.3,729.0769230769231,1386.2222222222222,0.025,792.4102564102565,0.975,0.975,0.6,16.133333333333333,23.066666666666666,1902.9055555555558,2.722222222222222,768.6923076923076,2033.4603630724316,0.0638888888888889,945.9943359110025,0.9361111111111112,0.9249999999999999,0.7444444444444445
|
| 9 |
+
eval_results_avg4-global_step_70,7.5,13.3,2323.1,2.6666666666666665,663.0,2507.5555555555557,0.1,804.8518518518518,0.9,0.9,0.8666666666666667,4.2,10.0,1029.3666666666666,0.36666666666666664,956.5,1034.5714285714287,0.0,1029.3666666666666,1.0,0.9666666666666667,0.6333333333333333,38.1,57.5,2180.425,0.275,685.8461538461538,2900.037037037037,0.075,1059.918918918919,0.925,0.9,0.75,16.599999999999998,26.933333333333334,1844.2972222222222,1.1027777777777776,768.4487179487179,2147.3880070546734,0.05833333333333333,964.7124791458124,0.9416666666666668,0.9222222222222222,0.75
|
| 10 |
+
eval_results_avg4-global_step_80,6.7,16.7,3168.5,0.3,773.0,3251.103448275862,0.13333333333333333,1194.4615384615386,0.8666666666666667,0.8333333333333334,0.7666666666666667,3.3,3.3,967.0333333333333,0.5333333333333333,1042.0,964.448275862069,0.0,967.0333333333333,1.0,1.0,0.6333333333333333,38.1,60.0,1234.025,2.8,685.1538461538462,1498.2962962962963,0.025,855.4102564102565,0.975,0.975,0.55,16.033333333333335,26.666666666666668,1789.8527777777774,1.211111111111111,833.3846153846154,1904.6160068114093,0.05277777777777778,1005.6350427350427,0.9472222222222223,0.9361111111111112,0.65
|
| 11 |
+
eval_results_avg4-global_step_90,9.2,20.0,3377.366666666667,0.3,795.75,3774.5384615384614,0.16666666666666666,852.76,0.8333333333333334,0.8333333333333334,0.9,3.3,6.7,1786.9333333333334,0.5333333333333333,0.0,1786.9333333333334,0.06666666666666667,783.1785714285714,0.9333333333333333,0.9333333333333333,0.7,38.1,60.0,1326.225,0.275,689.5,1599.107142857143,0.025,949.974358974359,0.975,0.975,0.775,16.866666666666667,28.900000000000002,2163.508333333333,0.36944444444444446,495.0833333333333,2386.859645909646,0.08611111111111112,861.9709768009767,0.9138888888888889,0.9138888888888889,0.7916666666666666
|
| 12 |
+
eval_results_avg4-global_step_100,7.5,10.0,2870.633333333333,0.5333333333333333,606.0,3032.3928571428573,0.06666666666666667,1948.892857142857,0.9333333333333333,0.8666666666666667,0.8333333333333334,7.5,16.7,1585.2,0.36666666666666664,922.0,1632.5714285714287,0.03333333333333333,1088.1724137931035,0.9666666666666667,0.9333333333333333,0.7333333333333333,36.9,60.0,1372.1,0.225,655.2307692307693,1717.2592592592594,0.025,996.8205128205128,0.975,0.95,0.625,17.3,28.900000000000002,1942.6444444444442,0.375,727.7435897435898,2127.4078483245153,0.041666666666666664,1344.628594585491,0.9583333333333334,0.9166666666666666,0.7305555555555555
|
eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 5.0,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 5.0,
|
| 10 |
+
"2": 8.9,
|
| 11 |
+
"4": 13.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 185.94950437545776,
|
| 14 |
+
"time_use_in_minite": "3:05"
|
| 15 |
+
}
|
eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3,
|
| 10 |
+
"2": 6.1,
|
| 11 |
+
"4": 10.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 193.96814250946045,
|
| 14 |
+
"time_use_in_minite": "3:13"
|
| 15 |
+
}
|
eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 4,
|
| 6 |
+
"acc": 26.9,
|
| 7 |
+
"pass_acc": 55.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.9,
|
| 10 |
+
"2": 41.2,
|
| 11 |
+
"4": 55.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 202.22902059555054,
|
| 14 |
+
"time_use_in_minite": "3:22"
|
| 15 |
+
}
|
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 5.8,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 5.8,
|
| 10 |
+
"2": 7.8,
|
| 11 |
+
"4": 10.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 172.10288619995117,
|
| 14 |
+
"time_use_in_minite": "2:52"
|
| 15 |
+
}
|
eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.8,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.8,
|
| 10 |
+
"2": 1.7,
|
| 11 |
+
"4": 3.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 188.8493616580963,
|
| 14 |
+
"time_use_in_minite": "3:08"
|
| 15 |
+
}
|
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 27.5,
|
| 7 |
+
"pass_acc": 55.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 27.5,
|
| 10 |
+
"2": 41.2,
|
| 11 |
+
"4": 55.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 193.1791069507599,
|
| 14 |
+
"time_use_in_minite": "3:13"
|
| 15 |
+
}
|
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 7.5,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 7.5,
|
| 10 |
+
"2": 9.4,
|
| 11 |
+
"4": 10.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 186.0230197906494,
|
| 14 |
+
"time_use_in_minite": "3:06"
|
| 15 |
+
}
|
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 7.5,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 7.5,
|
| 10 |
+
"2": 11.1,
|
| 11 |
+
"4": 16.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 176.03479647636414,
|
| 14 |
+
"time_use_in_minite": "2:56"
|
| 15 |
+
}
|
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 36.9,
|
| 7 |
+
"pass_acc": 60.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 36.9,
|
| 10 |
+
"2": 48.3,
|
| 11 |
+
"4": 60.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 170.67507100105286,
|
| 14 |
+
"time_use_in_minite": "2:50"
|
| 15 |
+
}
|
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3,
|
| 10 |
+
"2": 3.3,
|
| 11 |
+
"4": 3.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 190.46656322479248,
|
| 14 |
+
"time_use_in_minite": "3:10"
|
| 15 |
+
}
|
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3,
|
| 10 |
+
"2": 5.0,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 179.1738040447235,
|
| 14 |
+
"time_use_in_minite": "2:59"
|
| 15 |
+
}
|
eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|