Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_merged/merged.csv +12 -0
- eval_results_merged_v3/merged.csv +12 -0
- eval_results_merged_v3/missing.txt +11 -0
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3,
|
| 10 |
+
"2": 6.1,
|
| 11 |
+
"4": 10.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 181.98684406280518,
|
| 14 |
+
"time_use_in_minite": "3:01"
|
| 15 |
+
}
|
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 2.5,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 2.5,
|
| 10 |
+
"2": 4.4,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 176.9613161087036,
|
| 14 |
+
"time_use_in_minite": "2:56"
|
| 15 |
+
}
|
eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 31.9,
|
| 7 |
+
"pass_acc": 57.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 31.9,
|
| 10 |
+
"2": 44.6,
|
| 11 |
+
"4": 57.5
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 177.31663274765015,
|
| 14 |
+
"time_use_in_minite": "2:57"
|
| 15 |
+
}
|
eval_results_avg4/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 8.3,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 8.3,
|
| 10 |
+
"2": 12.8,
|
| 11 |
+
"4": 16.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 171.64313411712646,
|
| 14 |
+
"time_use_in_minite": "2:51"
|
| 15 |
+
}
|
eval_results_avg4/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 4.2,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.2,
|
| 10 |
+
"2": 6.7,
|
| 11 |
+
"4": 10.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 158.31116390228271,
|
| 14 |
+
"time_use_in_minite": "2:38"
|
| 15 |
+
}
|
eval_results_avg4/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 33.8,
|
| 7 |
+
"pass_acc": 50.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 33.8,
|
| 10 |
+
"2": 42.9,
|
| 11 |
+
"4": 50.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 166.854505777359,
|
| 14 |
+
"time_use_in_minite": "2:46"
|
| 15 |
+
}
|
eval_results_avg4/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 4.2,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 4.2,
|
| 10 |
+
"2": 5.0,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 168.94713473320007,
|
| 14 |
+
"time_use_in_minite": "2:48"
|
| 15 |
+
}
|
eval_results_avg4/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 2.5,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 2.5,
|
| 10 |
+
"2": 4.4,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 141.42398166656494,
|
| 14 |
+
"time_use_in_minite": "2:21"
|
| 15 |
+
}
|
eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 31.2,
|
| 7 |
+
"pass_acc": 55.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 31.2,
|
| 10 |
+
"2": 42.5,
|
| 11 |
+
"4": 55.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 158.56345629692078,
|
| 14 |
+
"time_use_in_minite": "2:38"
|
| 15 |
+
}
|
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 8.3,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 8.3,
|
| 10 |
+
"2": 12.8,
|
| 11 |
+
"4": 20.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 182.78652429580688,
|
| 14 |
+
"time_use_in_minite": "3:02"
|
| 15 |
+
}
|
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.8,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.8,
|
| 10 |
+
"2": 1.7,
|
| 11 |
+
"4": 3.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 166.4728286266327,
|
| 14 |
+
"time_use_in_minite": "2:46"
|
| 15 |
+
}
|
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 35.0,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 35.0,
|
| 10 |
+
"2": 44.6,
|
| 11 |
+
"4": 52.5
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 152.36885404586792,
|
| 14 |
+
"time_use_in_minite": "2:32"
|
| 15 |
+
}
|
eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 10.0,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 10.0,
|
| 10 |
+
"2": 14.4,
|
| 11 |
+
"4": 20.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 161.36754608154297,
|
| 14 |
+
"time_use_in_minite": "2:41"
|
| 15 |
+
}
|
eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 2.5,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 2.5,
|
| 10 |
+
"2": 4.4,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 159.51917576789856,
|
| 14 |
+
"time_use_in_minite": "2:39"
|
| 15 |
+
}
|
eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 30.0,
|
| 7 |
+
"pass_acc": 45.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 30.0,
|
| 10 |
+
"2": 38.3,
|
| 11 |
+
"4": 45.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 164.06099128723145,
|
| 14 |
+
"time_use_in_minite": "2:44"
|
| 15 |
+
}
|
eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 10.8,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 10.8,
|
| 10 |
+
"2": 15.6,
|
| 11 |
+
"4": 20.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 161.0793993473053,
|
| 14 |
+
"time_use_in_minite": "2:41"
|
| 15 |
+
}
|
eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 1.7,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 1.7,
|
| 10 |
+
"2": 3.3,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 164.75507164001465,
|
| 14 |
+
"time_use_in_minite": "2:44"
|
| 15 |
+
}
|
eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 35.0,
|
| 7 |
+
"pass_acc": 47.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 35.0,
|
| 10 |
+
"2": 41.2,
|
| 11 |
+
"4": 47.5
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 160.2115559577942,
|
| 14 |
+
"time_use_in_minite": "2:40"
|
| 15 |
+
}
|
eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7,
|
| 10 |
+
"2": 10.0,
|
| 11 |
+
"4": 13.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 176.63634634017944,
|
| 14 |
+
"time_use_in_minite": "2:56"
|
| 15 |
+
}
|
eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.8,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.8,
|
| 10 |
+
"2": 1.7,
|
| 11 |
+
"4": 3.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 183.75958013534546,
|
| 14 |
+
"time_use_in_minite": "3:03"
|
| 15 |
+
}
|
eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 35.0,
|
| 7 |
+
"pass_acc": 50.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 35.0,
|
| 10 |
+
"2": 42.5,
|
| 11 |
+
"4": 50.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 80.48573231697083,
|
| 14 |
+
"time_use_in_minite": "1:20"
|
| 15 |
+
}
|
eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 12.5,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 12.5,
|
| 10 |
+
"2": 18.3,
|
| 11 |
+
"4": 23.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 164.63086247444153,
|
| 14 |
+
"time_use_in_minite": "2:44"
|
| 15 |
+
}
|
eval_results_avg4/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 2.5,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 2.5,
|
| 10 |
+
"2": 4.4,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 187.7517955303192,
|
| 14 |
+
"time_use_in_minite": "3:07"
|
| 15 |
+
}
|
eval_results_avg4/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 29.4,
|
| 7 |
+
"pass_acc": 50.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 29.4,
|
| 10 |
+
"2": 39.6,
|
| 11 |
+
"4": 50.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 176.775892496109,
|
| 14 |
+
"time_use_in_minite": "2:56"
|
| 15 |
+
}
|
eval_results_merged/merged.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global_step,aime24_acc_avg4,aime25_acc_avg4,amc23_acc_avg4,aime24_acc,aime25_acc,amc23_acc,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,gpqa_pass@1:1_samples_ood
|
| 2 |
+
0,5.80,1.70,25.60,0.00,0.00,25.00,59.60,46.00,15.10,16.70,40.00,17.7,0.24
|
| 3 |
+
10,3.30,3.30,30.00,13.30,3.30,22.50,75.40,56.40,21.00,23.10,43.40,19.8,0.27
|
| 4 |
+
20,3.30,2.50,31.90,0.00,0.00,32.50,79.00,60.00,28.70,25.80,47.00,23.7,0.26
|
| 5 |
+
30,8.30,4.20,33.80,6.70,3.30,35.00,79.60,63.20,26.10,25.00,49.10,27.0,0.25
|
| 6 |
+
40,4.20,2.50,31.20,3.30,0.00,45.00,81.20,62.80,26.80,27.90,51.90,27.9,0.31
|
| 7 |
+
50,8.30,0.80,35.00,6.70,0.00,37.50,81.30,63.40,25.70,27.40,52.40,29.0,0.27
|
| 8 |
+
60,10.00,2.50,30.00,6.70,3.30,27.50,82.80,65.80,27.60,28.30,52.60,27.0,0.30
|
| 9 |
+
70,10.80,1.70,35.00,3.30,3.30,32.50,83.20,63.00,27.20,25.20,53.30,29.2,0.26
|
| 10 |
+
80,6.70,0.80,35.00,3.30,3.30,30.00,84.20,64.40,29.00,27.10,55.10,29.2,0.32
|
| 11 |
+
90,12.50,2.50,29.40,3.30,3.30,35.00,83.50,64.20,29.40,28.60,56.70,28.1,0.27
|
| 12 |
+
100,10.00,2.50,28.70,3.30,3.30,37.50,83.30,64.60,26.50,27.90,57.70,30.9,0.29
|
eval_results_merged_v3/merged.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global_step,aime24_acc_avg16,aime25_acc_avg16,amc23_acc_avg16,aime24_acc_avg32,aime25_acc_avg32,amc23_acc_avg32,aime24_acc,aime25_acc,amc23_acc,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,gpqa_pass@1:1_samples_ood
|
| 2 |
+
0,,,,,,,0.00,0.00,25.00,59.60,46.00,15.10,16.70,40.00,17.7,0.24
|
| 3 |
+
10,,,,,,,13.30,3.30,22.50,75.40,56.40,21.00,23.10,43.40,19.8,0.27
|
| 4 |
+
20,,,,,,,0.00,0.00,32.50,79.00,60.00,28.70,25.80,47.00,23.7,0.26
|
| 5 |
+
30,,,,,,,6.70,3.30,35.00,79.60,63.20,26.10,25.00,49.10,27.0,0.25
|
| 6 |
+
40,,,,,,,3.30,0.00,45.00,81.20,62.80,26.80,27.90,51.90,27.9,0.31
|
| 7 |
+
50,,,,,,,6.70,0.00,37.50,81.30,63.40,25.70,27.40,52.40,29.0,0.27
|
| 8 |
+
60,,,,,,,6.70,3.30,27.50,82.80,65.80,27.60,28.30,52.60,27.0,0.30
|
| 9 |
+
70,,,,,,,3.30,3.30,32.50,83.20,63.00,27.20,25.20,53.30,29.2,0.26
|
| 10 |
+
80,,,,,,,3.30,3.30,30.00,84.20,64.40,29.00,27.10,55.10,29.2,0.32
|
| 11 |
+
90,,,,,,,3.30,3.30,35.00,83.50,64.20,29.40,28.60,56.70,28.1,0.27
|
| 12 |
+
100,,,,,,,3.30,3.30,37.50,83.30,64.60,26.50,27.90,57.70,30.9,0.29
|
eval_results_merged_v3/missing.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
step 0: missing avg16, avg32
|
| 2 |
+
step 10: missing avg16, avg32
|
| 3 |
+
step 20: missing avg16, avg32
|
| 4 |
+
step 30: missing avg16, avg32
|
| 5 |
+
step 40: missing avg16, avg32
|
| 6 |
+
step 50: missing avg16, avg32
|
| 7 |
+
step 60: missing avg16, avg32
|
| 8 |
+
step 70: missing avg16, avg32
|
| 9 |
+
step 80: missing avg16, avg32
|
| 10 |
+
step 90: missing avg16, avg32
|
| 11 |
+
step 100: missing avg16, avg32
|