Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
- eval_results_avg4/eval_results.csv +12 -0
- eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 3,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3,
|
| 10 |
+
"2": 5.8,
|
| 11 |
+
"4": 9.3,
|
| 12 |
+
"8": 13.9,
|
| 13 |
+
"16": 19.3,
|
| 14 |
+
"32": 23.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 529.1417696475983,
|
| 17 |
+
"time_use_in_minite": "8:49"
|
| 18 |
+
}
|
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 7.3,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 7.3,
|
| 10 |
+
"2": 11.1,
|
| 11 |
+
"4": 15.3,
|
| 12 |
+
"8": 19.4,
|
| 13 |
+
"16": 22.3,
|
| 14 |
+
"32": 23.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 819.9961113929749,
|
| 17 |
+
"time_use_in_minite": "13:39"
|
| 18 |
+
}
|
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 4.3,
|
| 7 |
+
"pass_acc": 30.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.3,
|
| 10 |
+
"2": 7.1,
|
| 11 |
+
"4": 11.0,
|
| 12 |
+
"8": 16.1,
|
| 13 |
+
"16": 22.4,
|
| 14 |
+
"32": 30.0
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 577.8698544502258,
|
| 17 |
+
"time_use_in_minite": "9:37"
|
| 18 |
+
}
|
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.5,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.5,
|
| 10 |
+
"2": 9.0,
|
| 11 |
+
"4": 12.0,
|
| 12 |
+
"8": 15.8,
|
| 13 |
+
"16": 20.1,
|
| 14 |
+
"32": 23.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 829.8540849685669,
|
| 17 |
+
"time_use_in_minite": "13:49"
|
| 18 |
+
}
|
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 3,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.6,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.6,
|
| 10 |
+
"2": 5.9,
|
| 11 |
+
"4": 8.7,
|
| 12 |
+
"8": 11.9,
|
| 13 |
+
"16": 16.3,
|
| 14 |
+
"32": 23.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 546.3630204200745,
|
| 17 |
+
"time_use_in_minite": "9:06"
|
| 18 |
+
}
|
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.9,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.9,
|
| 10 |
+
"2": 10.1,
|
| 11 |
+
"4": 13.8,
|
| 12 |
+
"8": 18.1,
|
| 13 |
+
"16": 22.4,
|
| 14 |
+
"32": 26.7
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 647.9162139892578,
|
| 17 |
+
"time_use_in_minite": "10:47"
|
| 18 |
+
}
|
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.2,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.2,
|
| 10 |
+
"2": 5.4,
|
| 11 |
+
"4": 8.2,
|
| 12 |
+
"8": 11.6,
|
| 13 |
+
"16": 16.3,
|
| 14 |
+
"32": 23.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 469.15487265586853,
|
| 17 |
+
"time_use_in_minite": "7:49"
|
| 18 |
+
}
|
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 8.3,
|
| 7 |
+
"pass_acc": 33.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 8.3,
|
| 10 |
+
"2": 12.4,
|
| 11 |
+
"4": 17.1,
|
| 12 |
+
"8": 22.3,
|
| 13 |
+
"16": 27.9,
|
| 14 |
+
"32": 33.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 582.5299682617188,
|
| 17 |
+
"time_use_in_minite": "9:42"
|
| 18 |
+
}
|
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 960,
|
| 4 |
+
"timeout_samples": 2,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 4.7,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.7,
|
| 10 |
+
"2": 7.5,
|
| 11 |
+
"4": 10.8,
|
| 12 |
+
"8": 14.6,
|
| 13 |
+
"16": 19.1,
|
| 14 |
+
"32": 23.3
|
| 15 |
+
},
|
| 16 |
+
"time_use_in_second": 463.4439477920532,
|
| 17 |
+
"time_use_in_minite": "7:43"
|
| 18 |
+
}
|
eval_results_avg4/eval_results.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
|
| 2 |
+
eval_results_avg4-global_step_0,3.3,13.3,1018.9666666666667,0.13333333333333333,915.0,1022.551724137931,0.0,1018.9666666666667,1.0,0.9,0.6333333333333333,2.5,10.0,1953.8,0.3,0.0,1953.8,0.03333333333333333,1469.448275862069,0.9666666666666667,0.9,0.7333333333333333,22.5,52.5,1373.575,0.275,2476.1,1006.0666666666667,0.025,968.3846153846154,0.975,0.85,0.6,9.433333333333334,25.266666666666666,1448.7805555555553,0.23611111111111113,1130.3666666666666,1327.4727969348658,0.019444444444444445,1152.2665193044504,0.9805555555555556,0.8833333333333333,0.6555555555555556
|
| 3 |
+
eval_results_avg4-global_step_10,8.3,20.0,2643.266666666667,2.966666666666667,728.0,2856.074074074074,0.1,1160.5555555555557,0.9,0.8333333333333334,0.7666666666666667,4.2,13.3,2264.0333333333333,0.3,1845.0,2278.4827586206898,0.06666666666666667,1283.0714285714287,0.9333333333333333,0.8666666666666667,0.6333333333333333,30.0,52.5,1322.3,3.05,738.9166666666666,1572.3214285714287,0.025,943.5641025641025,0.975,0.975,0.775,14.166666666666666,28.599999999999998,2076.5333333333333,2.1055555555555556,1103.9722222222222,2235.6260870887304,0.0638888888888889,1129.0636955636955,0.9361111111111112,0.8916666666666667,0.725
|
| 4 |
+
eval_results_avg4-global_step_20,4.2,10.0,3127.3333333333335,1.2,0.0,3127.3333333333335,0.1,1697.148148148148,0.9,0.8,0.8,1.7,6.7,1809.2333333333333,0.2,0.0,1809.2333333333333,0.06666666666666667,795.6071428571429,0.9333333333333333,0.9333333333333333,0.7,32.5,57.5,1592.275,0.525,2081.0,1429.3666666666666,0.05,834.0263157894736,0.95,0.95,0.675,12.799999999999999,24.733333333333334,2176.2805555555556,0.6416666666666666,693.6666666666666,2121.9777777777776,0.07222222222222223,1108.9272022649216,0.9277777777777777,0.8944444444444445,0.725
|
| 5 |
+
eval_results_avg4-global_step_30,9.2,20.0,3154.3,0.9666666666666667,697.0,3329.8214285714284,0.13333333333333333,1178.1153846153845,0.8666666666666667,0.8,0.8,3.3,10.0,2426.0666666666666,0.3,1031.5,2525.6785714285716,0.1,918.0370370370371,0.9,0.9,0.7,37.5,60.0,2099.325,0.3,1655.1764705882354,2427.608695652174,0.05,1367.7894736842106,0.95,0.9,0.675,16.666666666666668,30.0,2559.8972222222224,0.5222222222222223,1127.892156862745,2761.036231884058,0.09444444444444444,1154.647298445544,0.9055555555555556,0.8666666666666667,0.725
|
| 6 |
+
eval_results_avg4-global_step_40,5.0,13.3,2390.133333333333,0.23333333333333334,738.5,2508.1071428571427,0.1,874.7777777777778,0.9,0.9,0.7666666666666667,3.3,13.3,1361.8333333333333,0.36666666666666664,690.0,1385.0,0.03333333333333333,857.0689655172414,0.9666666666666667,0.9666666666666667,0.8333333333333334,32.5,47.5,1607.7,0.25,720.1,1903.5666666666666,0.05,874.3947368421053,0.95,0.95,0.675,13.6,24.7,1786.5555555555554,0.2833333333333333,716.1999999999999,1932.2246031746029,0.061111111111111116,868.7471600457083,0.9388888888888888,0.9388888888888888,0.7583333333333334
|
| 7 |
+
eval_results_avg4-global_step_50,8.3,13.3,2279.633333333333,5.9,742.5,2389.4285714285716,0.1,761.3703703703703,0.9,0.9,0.6666666666666666,4.2,6.7,1324.0333333333333,0.5333333333333333,1383.0,1322.0,0.03333333333333333,817.9310344827586,0.9666666666666667,0.9666666666666667,0.5333333333333333,34.4,57.5,2250.7,0.225,643.1818181818181,2860.448275862069,0.075,1133.7027027027027,0.925,0.9,0.7,15.633333333333333,25.833333333333332,1951.4555555555555,2.2194444444444446,922.8939393939394,2190.6256157635466,0.06944444444444443,904.3347025186107,0.9305555555555557,0.9222222222222222,0.6333333333333333
|
| 8 |
+
eval_results_avg4-global_step_60,7.5,16.7,2607.233333333333,0.06666666666666667,640.3333333333334,2825.777777777778,0.1,1119.4074074074074,0.9,0.8333333333333334,0.7,3.3,6.7,2862.133333333333,0.36666666666666664,967.0,2997.5,0.13333333333333333,841.0384615384615,0.8666666666666667,0.8666666666666667,0.7,36.2,55.0,763.875,0.175,687.2,809.88,0.0,763.875,1.0,1.0,0.65,15.666666666666666,26.133333333333336,2077.7472222222223,0.20277777777777775,764.8444444444445,2211.0525925925926,0.07777777777777778,908.1069563152896,0.9222222222222222,0.9,0.6833333333333332
|
| 9 |
+
eval_results_avg4-global_step_70,10.0,13.3,2860.3,0.2,633.5,3019.3571428571427,0.1,1400.5185185185185,0.9,0.8666666666666667,0.8333333333333334,6.7,20.0,1896.0666666666666,7.866666666666666,864.5,2054.769230769231,0.06666666666666667,888.6071428571429,0.9333333333333333,0.9333333333333333,0.8333333333333334,33.8,50.0,899.225,0.175,651.1428571428571,1032.8076923076924,0.0,899.225,1.0,0.975,0.75,16.833333333333332,27.766666666666666,1885.1972222222223,2.7472222222222222,716.3809523809523,2035.6446886446886,0.05555555555555556,1062.783553791887,0.9444444444444445,0.9249999999999999,0.8055555555555557
|
| 10 |
+
eval_results_avg4-global_step_80,7.5,10.0,1627.2333333333333,0.43333333333333335,779.3333333333334,1721.4444444444443,0.03333333333333333,1131.655172413793,0.9666666666666667,0.9333333333333333,0.8333333333333334,4.2,10.0,1895.1,0.4,997.5,1959.2142857142858,0.06666666666666667,887.6428571428571,0.9333333333333333,0.9333333333333333,0.7,36.2,52.5,1330.375,0.3,692.25,1755.7916666666667,0.025,954.2564102564103,0.975,0.95,0.725,15.966666666666669,24.166666666666668,1617.5694444444443,0.37777777777777777,823.0277777777778,1812.1501322751321,0.041666666666666664,991.1848132710202,0.9583333333333334,0.9388888888888888,0.7527777777777778
|
| 11 |
+
eval_results_avg4-global_step_90,5.0,13.3,3021.133333333333,0.26666666666666666,1048.75,3324.576923076923,0.1,1579.4444444444443,0.9,0.8333333333333334,0.8666666666666667,4.2,6.7,1086.6333333333334,0.6666666666666666,620.0,1102.7241379310344,0.0,1086.6333333333334,1.0,0.9666666666666667,0.7,39.4,60.0,912.675,0.2,678.6666666666666,1053.08,0.0,912.675,1.0,1.0,0.75,16.2,26.666666666666668,1673.4805555555556,0.37777777777777777,782.4722222222222,1826.7936870026524,0.03333333333333333,1192.9175925925927,0.9666666666666667,0.9333333333333332,0.7722222222222221
|
| 12 |
+
eval_results_avg4-global_step_100,9.2,16.7,2744.766666666667,0.23333333333333334,1203.0,2797.9310344827586,0.1,1271.888888888889,0.9,0.9,0.7333333333333333,5.0,6.7,1921.2666666666667,0.43333333333333335,1063.0,1950.8620689655172,0.06666666666666667,915.6071428571429,0.9333333333333333,0.9333333333333333,0.7,39.4,57.5,1210.475,0.225,691.0,1556.7916666666667,0.025,831.2820512820513,0.975,0.975,0.725,17.866666666666664,26.96666666666667,1958.836111111111,0.2972222222222222,985.6666666666666,2101.8615900383143,0.0638888888888889,1006.259361009361,0.9361111111111112,0.9361111111111112,0.7194444444444444
|
eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3,
|
| 10 |
+
"2": 6.7,
|
| 11 |
+
"4": 13.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 171.29286766052246,
|
| 14 |
+
"time_use_in_minite": "2:51"
|
| 15 |
+
}
|
eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 2.5,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 2.5,
|
| 10 |
+
"2": 5.0,
|
| 11 |
+
"4": 10.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 177.67143607139587,
|
| 14 |
+
"time_use_in_minite": "2:57"
|
| 15 |
+
}
|
eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 2,
|
| 6 |
+
"acc": 22.5,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 22.5,
|
| 10 |
+
"2": 34.6,
|
| 11 |
+
"4": 52.5
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 174.92642736434937,
|
| 14 |
+
"time_use_in_minite": "2:54"
|
| 15 |
+
}
|
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 8.3,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 8.3,
|
| 10 |
+
"2": 12.8,
|
| 11 |
+
"4": 20.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 225.51578545570374,
|
| 14 |
+
"time_use_in_minite": "3:45"
|
| 15 |
+
}
|
eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 4.2,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.2,
|
| 10 |
+
"2": 7.8,
|
| 11 |
+
"4": 13.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 197.61598801612854,
|
| 14 |
+
"time_use_in_minite": "3:17"
|
| 15 |
+
}
|
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 30.0,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 30.0,
|
| 10 |
+
"2": 40.0,
|
| 11 |
+
"4": 52.5
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 197.73797011375427,
|
| 14 |
+
"time_use_in_minite": "3:17"
|
| 15 |
+
}
|
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 9.2,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 9.2,
|
| 10 |
+
"2": 12.8,
|
| 11 |
+
"4": 16.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 192.01610326766968,
|
| 14 |
+
"time_use_in_minite": "3:12"
|
| 15 |
+
}
|
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 5.0,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 5.0,
|
| 10 |
+
"2": 6.1,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 188.81494522094727,
|
| 14 |
+
"time_use_in_minite": "3:08"
|
| 15 |
+
}
|
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 39.4,
|
| 7 |
+
"pass_acc": 57.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 39.4,
|
| 10 |
+
"2": 50.4,
|
| 11 |
+
"4": 57.5
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 175.83723950386047,
|
| 14 |
+
"time_use_in_minite": "2:55"
|
| 15 |
+
}
|
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 4.2,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.2,
|
| 10 |
+
"2": 6.7,
|
| 11 |
+
"4": 10.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 197.9232029914856,
|
| 14 |
+
"time_use_in_minite": "3:17"
|
| 15 |
+
}
|
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|