Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 79.8,
|
| 7 |
+
"pass_acc": 79.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 79.8
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 68.8884105682373,
|
| 12 |
+
"time_use_in_minite": "1:08"
|
| 13 |
+
}
|
eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 57.4,
|
| 7 |
+
"pass_acc": 57.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 57.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 188.7331085205078,
|
| 12 |
+
"time_use_in_minite": "3:08"
|
| 13 |
+
}
|
eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 24.6,
|
| 7 |
+
"pass_acc": 24.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 24.6
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 45.8,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 46.2,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 13.2,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 15.5,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 9.1,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 38.9,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 0.0
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 168.47852659225464,
|
| 23 |
+
"time_use_in_minite": "2:48"
|
| 24 |
+
}
|
eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 5,
|
| 5 |
+
"empty_samples": 3,
|
| 6 |
+
"acc": 45.9,
|
| 7 |
+
"pass_acc": 45.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 45.9
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 25.0,
|
| 13 |
+
"astronomy": 56.6,
|
| 14 |
+
"college_biology": 64.6,
|
| 15 |
+
"college_chemistry": 49.0,
|
| 16 |
+
"college_computer_science": 39.0,
|
| 17 |
+
"college_mathematics": 29.0,
|
| 18 |
+
"college_physics": 41.2,
|
| 19 |
+
"computer_security": 52.0,
|
| 20 |
+
"conceptual_physics": 56.6,
|
| 21 |
+
"electrical_engineering": 46.9,
|
| 22 |
+
"elementary_mathematics": 41.3,
|
| 23 |
+
"high_school_biology": 64.2,
|
| 24 |
+
"high_school_chemistry": 52.7,
|
| 25 |
+
"high_school_computer_science": 52.0,
|
| 26 |
+
"high_school_mathematics": 14.8,
|
| 27 |
+
"high_school_physics": 43.7,
|
| 28 |
+
"high_school_statistics": 48.1,
|
| 29 |
+
"machine_learning": 39.3
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 429.1207802295685,
|
| 32 |
+
"time_use_in_minite": "7:09"
|
| 33 |
+
}
|
eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 24.3,
|
| 7 |
+
"pass_acc": 24.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 24.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 389.05578565597534,
|
| 12 |
+
"time_use_in_minite": "6:29"
|
| 13 |
+
}
|
eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 138.837792634964,
|
| 12 |
+
"time_use_in_minite": "2:18"
|
| 13 |
+
}
|
eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 32.871572732925415,
|
| 12 |
+
"time_use_in_minite": "0:32"
|
| 13 |
+
}
|
eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 30.0,
|
| 7 |
+
"pass_acc": 30.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 30.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 136.81593942642212,
|
| 12 |
+
"time_use_in_minite": "2:16"
|
| 13 |
+
}
|
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 80.1,
|
| 7 |
+
"pass_acc": 80.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 80.1
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 194.5436201095581,
|
| 12 |
+
"time_use_in_minite": "3:14"
|
| 13 |
+
}
|
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 59.4,
|
| 7 |
+
"pass_acc": 59.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 59.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 208.02852749824524,
|
| 12 |
+
"time_use_in_minite": "3:28"
|
| 13 |
+
}
|
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 2,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 22.1,
|
| 7 |
+
"pass_acc": 22.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 22.1
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 45.8,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 34.6,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 0.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 0.0,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 11.3,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 18.6,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 0.0,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 27.8,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 0.0
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 167.37208485603333,
|
| 23 |
+
"time_use_in_minite": "2:47"
|
| 24 |
+
}
|
eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 3,
|
| 5 |
+
"empty_samples": 5,
|
| 6 |
+
"acc": 51.0,
|
| 7 |
+
"pass_acc": 51.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 51.0
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 38.0,
|
| 13 |
+
"astronomy": 63.8,
|
| 14 |
+
"college_biology": 62.5,
|
| 15 |
+
"college_chemistry": 44.0,
|
| 16 |
+
"college_computer_science": 44.0,
|
| 17 |
+
"college_mathematics": 31.0,
|
| 18 |
+
"college_physics": 50.0,
|
| 19 |
+
"computer_security": 46.0,
|
| 20 |
+
"conceptual_physics": 60.0,
|
| 21 |
+
"electrical_engineering": 53.8,
|
| 22 |
+
"elementary_mathematics": 48.9,
|
| 23 |
+
"high_school_biology": 66.8,
|
| 24 |
+
"high_school_chemistry": 53.7,
|
| 25 |
+
"high_school_computer_science": 61.0,
|
| 26 |
+
"high_school_mathematics": 26.3,
|
| 27 |
+
"high_school_physics": 45.7,
|
| 28 |
+
"high_school_statistics": 58.8,
|
| 29 |
+
"machine_learning": 44.6
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 478.0683960914612,
|
| 32 |
+
"time_use_in_minite": "7:58"
|
| 33 |
+
}
|
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 27.3,
|
| 7 |
+
"pass_acc": 27.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 27.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 371.59140634536743,
|
| 12 |
+
"time_use_in_minite": "6:11"
|
| 13 |
+
}
|
eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 154.73073053359985,
|
| 12 |
+
"time_use_in_minite": "2:34"
|
| 13 |
+
}
|
eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.0,
|
| 7 |
+
"pass_acc": 0.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 150.83997917175293,
|
| 12 |
+
"time_use_in_minite": "2:30"
|
| 13 |
+
}
|
eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 27.5,
|
| 7 |
+
"pass_acc": 27.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 27.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 152.0974292755127,
|
| 12 |
+
"time_use_in_minite": "2:32"
|
| 13 |
+
}
|
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 81.3,
|
| 7 |
+
"pass_acc": 81.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 81.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 190.61600041389465,
|
| 12 |
+
"time_use_in_minite": "3:10"
|
| 13 |
+
}
|
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 64.8,
|
| 7 |
+
"pass_acc": 64.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 64.8
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 218.48642587661743,
|
| 12 |
+
"time_use_in_minite": "3:38"
|
| 13 |
+
}
|
eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 27.2,
|
| 7 |
+
"pass_acc": 27.2,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 27.2
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 54.2,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 46.2,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 20.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 0.0,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 13.2,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 17.5,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 9.1,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 55.6,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 0.0
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 177.72079944610596,
|
| 23 |
+
"time_use_in_minite": "2:57"
|
| 24 |
+
}
|
eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 3,
|
| 5 |
+
"empty_samples": 2,
|
| 6 |
+
"acc": 54.8,
|
| 7 |
+
"pass_acc": 54.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 54.8
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 44.0,
|
| 13 |
+
"astronomy": 67.8,
|
| 14 |
+
"college_biology": 61.8,
|
| 15 |
+
"college_chemistry": 51.0,
|
| 16 |
+
"college_computer_science": 53.0,
|
| 17 |
+
"college_mathematics": 38.0,
|
| 18 |
+
"college_physics": 52.9,
|
| 19 |
+
"computer_security": 58.0,
|
| 20 |
+
"conceptual_physics": 57.0,
|
| 21 |
+
"electrical_engineering": 61.4,
|
| 22 |
+
"elementary_mathematics": 56.9,
|
| 23 |
+
"high_school_biology": 68.7,
|
| 24 |
+
"high_school_chemistry": 56.7,
|
| 25 |
+
"high_school_computer_science": 71.0,
|
| 26 |
+
"high_school_mathematics": 30.4,
|
| 27 |
+
"high_school_physics": 51.0,
|
| 28 |
+
"high_school_statistics": 51.9,
|
| 29 |
+
"machine_learning": 49.1
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 406.9477262496948,
|
| 32 |
+
"time_use_in_minite": "6:46"
|
| 33 |
+
}
|
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 28.6,
|
| 7 |
+
"pass_acc": 28.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 28.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 448.2467827796936,
|
| 12 |
+
"time_use_in_minite": "7:28"
|
| 13 |
+
}
|
eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 140.95101380348206,
|
| 12 |
+
"time_use_in_minite": "2:20"
|
| 13 |
+
}
|
eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 134.8092610836029,
|
| 12 |
+
"time_use_in_minite": "2:14"
|
| 13 |
+
}
|