Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 218.54596161842346,
|
| 12 |
+
"time_use_in_minite": "3:38"
|
| 13 |
+
}
|
eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 210.55786085128784,
|
| 12 |
+
"time_use_in_minite": "3:30"
|
| 13 |
+
}
|
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 45.0,
|
| 7 |
+
"pass_acc": 45.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 45.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 56.58969306945801,
|
| 12 |
+
"time_use_in_minite": "0:56"
|
| 13 |
+
}
|
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 15,
|
| 6 |
+
"acc": 84.3,
|
| 7 |
+
"pass_acc": 84.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 84.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 335.95774841308594,
|
| 12 |
+
"time_use_in_minite": "5:35"
|
| 13 |
+
}
|
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 9,
|
| 6 |
+
"acc": 65.4,
|
| 7 |
+
"pass_acc": 65.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 65.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 380.1704812049866,
|
| 12 |
+
"time_use_in_minite": "6:20"
|
| 13 |
+
}
|
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 14,
|
| 6 |
+
"acc": 26.1,
|
| 7 |
+
"pass_acc": 26.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.1
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 35.4,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 38.5,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 66.7,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 20.8,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 16.5,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 18.2,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 44.4,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 27.3
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 243.3234167098999,
|
| 23 |
+
"time_use_in_minite": "4:03"
|
| 24 |
+
}
|
eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 8,
|
| 5 |
+
"empty_samples": 11,
|
| 6 |
+
"acc": 60.7,
|
| 7 |
+
"pass_acc": 60.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 60.7
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 53.0,
|
| 13 |
+
"astronomy": 78.9,
|
| 14 |
+
"college_biology": 80.6,
|
| 15 |
+
"college_chemistry": 49.0,
|
| 16 |
+
"college_computer_science": 52.0,
|
| 17 |
+
"college_mathematics": 43.0,
|
| 18 |
+
"college_physics": 56.9,
|
| 19 |
+
"computer_security": 65.0,
|
| 20 |
+
"conceptual_physics": 74.5,
|
| 21 |
+
"electrical_engineering": 64.1,
|
| 22 |
+
"elementary_mathematics": 52.1,
|
| 23 |
+
"high_school_biology": 75.8,
|
| 24 |
+
"high_school_chemistry": 63.5,
|
| 25 |
+
"high_school_computer_science": 75.0,
|
| 26 |
+
"high_school_mathematics": 29.3,
|
| 27 |
+
"high_school_physics": 55.6,
|
| 28 |
+
"high_school_statistics": 63.0,
|
| 29 |
+
"machine_learning": 66.1
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 614.4746489524841,
|
| 32 |
+
"time_use_in_minite": "10:14"
|
| 33 |
+
}
|
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 9,
|
| 6 |
+
"acc": 31.1,
|
| 7 |
+
"pass_acc": 31.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 31.1
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 503.0589566230774,
|
| 12 |
+
"time_use_in_minite": "8:23"
|
| 13 |
+
}
|
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 16.7,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 16.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 241.16228032112122,
|
| 12 |
+
"time_use_in_minite": "4:01"
|
| 13 |
+
}
|
eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 16.7,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 16.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 229.15140557289124,
|
| 12 |
+
"time_use_in_minite": "3:49"
|
| 13 |
+
}
|
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 91.8,
|
| 7 |
+
"pass_acc": 91.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 91.8
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 273.5060772895813,
|
| 12 |
+
"time_use_in_minite": "4:33"
|
| 13 |
+
}
|
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 79.2,
|
| 7 |
+
"pass_acc": 79.2,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 79.2
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 349.78808665275574,
|
| 12 |
+
"time_use_in_minite": "5:49"
|
| 13 |
+
}
|
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 2,
|
| 5 |
+
"empty_samples": 2,
|
| 6 |
+
"acc": 64.5,
|
| 7 |
+
"pass_acc": 64.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 64.5
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 63.0,
|
| 13 |
+
"astronomy": 81.6,
|
| 14 |
+
"college_biology": 81.9,
|
| 15 |
+
"college_chemistry": 58.0,
|
| 16 |
+
"college_computer_science": 64.0,
|
| 17 |
+
"college_mathematics": 53.0,
|
| 18 |
+
"college_physics": 51.0,
|
| 19 |
+
"computer_security": 57.0,
|
| 20 |
+
"conceptual_physics": 76.2,
|
| 21 |
+
"electrical_engineering": 67.6,
|
| 22 |
+
"elementary_mathematics": 53.7,
|
| 23 |
+
"high_school_biology": 86.5,
|
| 24 |
+
"high_school_chemistry": 72.4,
|
| 25 |
+
"high_school_computer_science": 68.0,
|
| 26 |
+
"high_school_mathematics": 30.0,
|
| 27 |
+
"high_school_physics": 64.2,
|
| 28 |
+
"high_school_statistics": 67.1,
|
| 29 |
+
"machine_learning": 65.2
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 605.1673429012299,
|
| 32 |
+
"time_use_in_minite": "10:05"
|
| 33 |
+
}
|
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 43.6,
|
| 7 |
+
"pass_acc": 43.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 43.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 614.8699924945831,
|
| 12 |
+
"time_use_in_minite": "10:14"
|
| 13 |
+
}
|
eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 299.66179943084717,
|
| 12 |
+
"time_use_in_minite": "4:59"
|
| 13 |
+
}
|
eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.7,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 270.207879781723,
|
| 12 |
+
"time_use_in_minite": "4:30"
|
| 13 |
+
}
|
eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 67.5,
|
| 7 |
+
"pass_acc": 67.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 67.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 255.5747799873352,
|
| 12 |
+
"time_use_in_minite": "4:15"
|
| 13 |
+
}
|
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 93.6,
|
| 7 |
+
"pass_acc": 93.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 93.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 295.32457733154297,
|
| 12 |
+
"time_use_in_minite": "4:55"
|
| 13 |
+
}
|
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 86.0,
|
| 7 |
+
"pass_acc": 86.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 86.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 437.3554677963257,
|
| 12 |
+
"time_use_in_minite": "7:17"
|
| 13 |
+
}
|
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 37.9,
|
| 7 |
+
"pass_acc": 37.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 37.9
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 62.5,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 46.2,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 32.1,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 22.7,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 18.2,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 61.1,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 54.5
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 277.40999937057495,
|
| 23 |
+
"time_use_in_minite": "4:37"
|
| 24 |
+
}
|
eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 77.6,
|
| 7 |
+
"pass_acc": 77.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 77.6
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 78.0,
|
| 13 |
+
"astronomy": 80.3,
|
| 14 |
+
"college_biology": 88.2,
|
| 15 |
+
"college_chemistry": 60.0,
|
| 16 |
+
"college_computer_science": 77.0,
|
| 17 |
+
"college_mathematics": 74.0,
|
| 18 |
+
"college_physics": 72.5,
|
| 19 |
+
"computer_security": 58.0,
|
| 20 |
+
"conceptual_physics": 83.0,
|
| 21 |
+
"electrical_engineering": 70.3,
|
| 22 |
+
"elementary_mathematics": 82.5,
|
| 23 |
+
"high_school_biology": 88.7,
|
| 24 |
+
"high_school_chemistry": 79.8,
|
| 25 |
+
"high_school_computer_science": 75.0,
|
| 26 |
+
"high_school_mathematics": 74.8,
|
| 27 |
+
"high_school_physics": 75.5,
|
| 28 |
+
"high_school_statistics": 75.0,
|
| 29 |
+
"machine_learning": 65.2
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 930.7638278007507,
|
| 32 |
+
"time_use_in_minite": "15:30"
|
| 33 |
+
}
|
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 50.4,
|
| 7 |
+
"pass_acc": 50.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 50.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 1464.0274574756622,
|
| 12 |
+
"time_use_in_minite": "24:24"
|
| 13 |
+
}
|
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 313.1780161857605,
|
| 12 |
+
"time_use_in_minite": "5:13"
|
| 13 |
+
}
|
eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 23.3,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 23.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 267.39144802093506,
|
| 12 |
+
"time_use_in_minite": "4:27"
|
| 13 |
+
}
|
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 62.5,
|
| 7 |
+
"pass_acc": 62.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 62.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 253.86100006103516,
|
| 12 |
+
"time_use_in_minite": "4:13"
|
| 13 |
+
}
|