Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- eval_results/global_step_50/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_50/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_60/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_60/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_60/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_70/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_70/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_80/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_80/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_80/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_80/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_80/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_80/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_80/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_80/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_80/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_80/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_90/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_90/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_90/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_90/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_90/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_90/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_90/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_90/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_90/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
eval_results/global_step_50/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_50/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_60/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_60/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_60/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.0,
|
| 7 |
+
"pass_acc": 0.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 39.98672890663147,
|
| 12 |
+
"time_use_in_minite": "0:39"
|
| 13 |
+
}
|
eval_results/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 42.5,
|
| 7 |
+
"pass_acc": 42.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 42.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 10.751169919967651,
|
| 12 |
+
"time_use_in_minite": "0:10"
|
| 13 |
+
}
|
eval_results/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 82.3,
|
| 7 |
+
"pass_acc": 82.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 82.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 226.53313755989075,
|
| 12 |
+
"time_use_in_minite": "3:46"
|
| 13 |
+
}
|
eval_results/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 61.8,
|
| 7 |
+
"pass_acc": 61.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 61.8
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 215.82098245620728,
|
| 12 |
+
"time_use_in_minite": "3:35"
|
| 13 |
+
}
|
eval_results/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.5,
|
| 7 |
+
"pass_acc": 26.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.5
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 47.9,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 46.2,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 20.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 17.0,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 16.5,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 0.0,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 44.4,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 18.2
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 171.99901270866394,
|
| 23 |
+
"time_use_in_minite": "2:51"
|
| 24 |
+
}
|
eval_results/global_step_70/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_70/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 2,
|
| 6 |
+
"acc": 55.5,
|
| 7 |
+
"pass_acc": 55.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 55.5
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 47.0,
|
| 13 |
+
"astronomy": 61.2,
|
| 14 |
+
"college_biology": 60.4,
|
| 15 |
+
"college_chemistry": 44.0,
|
| 16 |
+
"college_computer_science": 45.0,
|
| 17 |
+
"college_mathematics": 38.0,
|
| 18 |
+
"college_physics": 49.0,
|
| 19 |
+
"computer_security": 56.0,
|
| 20 |
+
"conceptual_physics": 59.1,
|
| 21 |
+
"electrical_engineering": 56.6,
|
| 22 |
+
"elementary_mathematics": 67.7,
|
| 23 |
+
"high_school_biology": 62.6,
|
| 24 |
+
"high_school_chemistry": 60.6,
|
| 25 |
+
"high_school_computer_science": 64.0,
|
| 26 |
+
"high_school_mathematics": 35.2,
|
| 27 |
+
"high_school_physics": 56.3,
|
| 28 |
+
"high_school_statistics": 53.7,
|
| 29 |
+
"machine_learning": 53.6
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 434.28683280944824,
|
| 32 |
+
"time_use_in_minite": "7:14"
|
| 33 |
+
}
|
eval_results/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 28.1,
|
| 7 |
+
"pass_acc": 28.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 28.1
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 391.85224533081055,
|
| 12 |
+
"time_use_in_minite": "6:31"
|
| 13 |
+
}
|
eval_results/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 168.11040616035461,
|
| 12 |
+
"time_use_in_minite": "2:48"
|
| 13 |
+
}
|
eval_results/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 153.61491012573242,
|
| 12 |
+
"time_use_in_minite": "2:33"
|
| 13 |
+
}
|
eval_results/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 25.0,
|
| 7 |
+
"pass_acc": 25.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 25.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 157.09997868537903,
|
| 12 |
+
"time_use_in_minite": "2:37"
|
| 13 |
+
}
|
eval_results/global_step_80/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_80/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 82.0,
|
| 7 |
+
"pass_acc": 82.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 82.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 202.64411163330078,
|
| 12 |
+
"time_use_in_minite": "3:22"
|
| 13 |
+
}
|
eval_results/global_step_80/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_80/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 64.4,
|
| 7 |
+
"pass_acc": 64.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 64.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 207.95617079734802,
|
| 12 |
+
"time_use_in_minite": "3:27"
|
| 13 |
+
}
|
eval_results/global_step_80/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_80/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 24.3,
|
| 7 |
+
"pass_acc": 24.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 24.3
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 45.8,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 38.5,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 0.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 13.2,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 17.5,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 18.2,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 38.9,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 0.0
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 182.77215790748596,
|
| 23 |
+
"time_use_in_minite": "3:02"
|
| 24 |
+
}
|
eval_results/global_step_80/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_80/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 4,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 59.4,
|
| 7 |
+
"pass_acc": 59.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 59.4
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 44.0,
|
| 13 |
+
"astronomy": 63.2,
|
| 14 |
+
"college_biology": 66.0,
|
| 15 |
+
"college_chemistry": 49.0,
|
| 16 |
+
"college_computer_science": 51.0,
|
| 17 |
+
"college_mathematics": 55.0,
|
| 18 |
+
"college_physics": 54.9,
|
| 19 |
+
"computer_security": 59.0,
|
| 20 |
+
"conceptual_physics": 67.2,
|
| 21 |
+
"electrical_engineering": 57.9,
|
| 22 |
+
"elementary_mathematics": 73.5,
|
| 23 |
+
"high_school_biology": 70.0,
|
| 24 |
+
"high_school_chemistry": 56.7,
|
| 25 |
+
"high_school_computer_science": 70.0,
|
| 26 |
+
"high_school_mathematics": 39.6,
|
| 27 |
+
"high_school_physics": 56.3,
|
| 28 |
+
"high_school_statistics": 55.1,
|
| 29 |
+
"machine_learning": 50.0
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 436.29017210006714,
|
| 32 |
+
"time_use_in_minite": "7:16"
|
| 33 |
+
}
|
eval_results/global_step_80/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_80/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.4,
|
| 7 |
+
"pass_acc": 26.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 408.1657428741455,
|
| 12 |
+
"time_use_in_minite": "6:48"
|
| 13 |
+
}
|
eval_results/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 10.0,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 10.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 159.29473733901978,
|
| 12 |
+
"time_use_in_minite": "2:39"
|
| 13 |
+
}
|
eval_results/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 10.0,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 10.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 133.72769832611084,
|
| 12 |
+
"time_use_in_minite": "2:13"
|
| 13 |
+
}
|
eval_results/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 42.5,
|
| 7 |
+
"pass_acc": 42.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 42.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 153.2292251586914,
|
| 12 |
+
"time_use_in_minite": "2:33"
|
| 13 |
+
}
|
eval_results/global_step_90/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_90/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 82.6,
|
| 7 |
+
"pass_acc": 82.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 82.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 195.806147813797,
|
| 12 |
+
"time_use_in_minite": "3:15"
|
| 13 |
+
}
|
eval_results/global_step_90/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_90/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 64.4,
|
| 7 |
+
"pass_acc": 64.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 64.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 252.0947515964508,
|
| 12 |
+
"time_use_in_minite": "4:12"
|
| 13 |
+
}
|
eval_results/global_step_90/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_90/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 3,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.5,
|
| 7 |
+
"pass_acc": 26.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.5
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 54.2,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 34.6,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 60.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 13.2,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 15.5,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 0.0,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 44.4,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 27.3
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 177.2519736289978,
|
| 23 |
+
"time_use_in_minite": "2:57"
|
| 24 |
+
}
|
eval_results/global_step_90/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_90/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 60.7,
|
| 7 |
+
"pass_acc": 60.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 60.7
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 48.0,
|
| 13 |
+
"astronomy": 61.8,
|
| 14 |
+
"college_biology": 68.8,
|
| 15 |
+
"college_chemistry": 53.0,
|
| 16 |
+
"college_computer_science": 57.0,
|
| 17 |
+
"college_mathematics": 47.0,
|
| 18 |
+
"college_physics": 55.9,
|
| 19 |
+
"computer_security": 60.0,
|
| 20 |
+
"conceptual_physics": 64.7,
|
| 21 |
+
"electrical_engineering": 53.8,
|
| 22 |
+
"elementary_mathematics": 79.6,
|
| 23 |
+
"high_school_biology": 68.7,
|
| 24 |
+
"high_school_chemistry": 62.6,
|
| 25 |
+
"high_school_computer_science": 69.0,
|
| 26 |
+
"high_school_mathematics": 41.9,
|
| 27 |
+
"high_school_physics": 57.0,
|
| 28 |
+
"high_school_statistics": 55.1,
|
| 29 |
+
"machine_learning": 52.7
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 415.04196524620056,
|
| 32 |
+
"time_use_in_minite": "6:55"
|
| 33 |
+
}
|
eval_results/global_step_90/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|