Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +4 -0
- eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_15/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_15/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_15/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_15/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_15/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_15/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_15/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_15/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_25/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_25/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_25/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_25/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_25/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_25/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_25/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_25/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_35/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_35/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_35/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_35/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
.gitattributes
CHANGED
|
@@ -47,3 +47,7 @@ global_step_115/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -
|
|
| 47 |
global_step_75/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 48 |
global_step_55/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 49 |
global_step_15/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
global_step_75/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 48 |
global_step_55/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 49 |
global_step_15/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
global_step_30/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
global_step_95/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
global_step_100/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
global_step_40/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 20,
|
| 6 |
+
"acc": 75.7,
|
| 7 |
+
"pass_acc": 75.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 75.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 125.95953345298767,
|
| 12 |
+
"time_use_in_minite": "2:05"
|
| 13 |
+
}
|
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 9,
|
| 6 |
+
"acc": 52.2,
|
| 7 |
+
"pass_acc": 52.2,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 52.2
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 94.54320168495178,
|
| 12 |
+
"time_use_in_minite": "1:34"
|
| 13 |
+
}
|
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 18,
|
| 6 |
+
"acc": 20.6,
|
| 7 |
+
"pass_acc": 20.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 20.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 200.8112862110138,
|
| 12 |
+
"time_use_in_minite": "3:20"
|
| 13 |
+
}
|
eval_results/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_15/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 34.98631310462952,
|
| 12 |
+
"time_use_in_minite": "0:34"
|
| 13 |
+
}
|
eval_results/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_15/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 52.5,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 52.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 68.07727694511414,
|
| 12 |
+
"time_use_in_minite": "1:08"
|
| 13 |
+
}
|
eval_results/global_step_15/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_15/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 90.2,
|
| 7 |
+
"pass_acc": 90.2,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 90.2
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 63.200294494628906,
|
| 12 |
+
"time_use_in_minite": "1:03"
|
| 13 |
+
}
|
eval_results/global_step_15/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_15/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 72.4,
|
| 7 |
+
"pass_acc": 72.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 72.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 57.943002700805664,
|
| 12 |
+
"time_use_in_minite": "0:57"
|
| 13 |
+
}
|
eval_results/global_step_15/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_15/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 35.7,
|
| 7 |
+
"pass_acc": 35.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 35.7
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 64.6,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 46.2,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 28.3,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 21.6,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 9.1,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 55.6,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 36.4
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 84.02002787590027,
|
| 23 |
+
"time_use_in_minite": "1:24"
|
| 24 |
+
}
|
eval_results/global_step_15/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_15/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 36.3,
|
| 7 |
+
"pass_acc": 36.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 36.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 138.21381092071533,
|
| 12 |
+
"time_use_in_minite": "2:18"
|
| 13 |
+
}
|
eval_results/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_25/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 10.0,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 10.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 68.03684830665588,
|
| 12 |
+
"time_use_in_minite": "1:08"
|
| 13 |
+
}
|
eval_results/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_25/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 52.5,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 52.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 30.228216886520386,
|
| 12 |
+
"time_use_in_minite": "0:30"
|
| 13 |
+
}
|
eval_results/global_step_25/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_25/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 91.3,
|
| 7 |
+
"pass_acc": 91.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 91.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 61.750991344451904,
|
| 12 |
+
"time_use_in_minite": "1:01"
|
| 13 |
+
}
|
eval_results/global_step_25/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_25/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 73.4,
|
| 7 |
+
"pass_acc": 73.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 73.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 102.64481163024902,
|
| 12 |
+
"time_use_in_minite": "1:42"
|
| 13 |
+
}
|
eval_results/global_step_25/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_25/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 32.7,
|
| 7 |
+
"pass_acc": 32.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 32.7
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 58.3,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 53.8,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 13.2,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 23.7,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 9.1,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 55.6,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 27.3
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 36.20563268661499,
|
| 23 |
+
"time_use_in_minite": "0:36"
|
| 24 |
+
}
|
eval_results/global_step_25/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_25/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 38.5,
|
| 7 |
+
"pass_acc": 38.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 38.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 137.9393606185913,
|
| 12 |
+
"time_use_in_minite": "2:17"
|
| 13 |
+
}
|
eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 16.7,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 16.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 67.79033923149109,
|
| 12 |
+
"time_use_in_minite": "1:07"
|
| 13 |
+
}
|
eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 52.5,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 52.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 22.189570665359497,
|
| 12 |
+
"time_use_in_minite": "0:22"
|
| 13 |
+
}
|
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 89.9,
|
| 7 |
+
"pass_acc": 89.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 89.9
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 110.01273012161255,
|
| 12 |
+
"time_use_in_minite": "1:50"
|
| 13 |
+
}
|
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 76.0,
|
| 7 |
+
"pass_acc": 76.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 76.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 101.99508786201477,
|
| 12 |
+
"time_use_in_minite": "1:41"
|
| 13 |
+
}
|
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 33.8,
|
| 7 |
+
"pass_acc": 33.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 33.8
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 62.5,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 46.2,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 20.8,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 22.7,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 9.1,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 50.0,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 36.4
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 36.29343605041504,
|
| 23 |
+
"time_use_in_minite": "0:36"
|
| 24 |
+
}
|
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 36.6,
|
| 7 |
+
"pass_acc": 36.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 36.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 140.6587872505188,
|
| 12 |
+
"time_use_in_minite": "2:20"
|
| 13 |
+
}
|
eval_results/global_step_35/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_35/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 66.62974858283997,
|
| 12 |
+
"time_use_in_minite": "1:06"
|
| 13 |
+
}
|
eval_results/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_35/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 55.0,
|
| 7 |
+
"pass_acc": 55.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 55.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 19.278455018997192,
|
| 12 |
+
"time_use_in_minite": "0:19"
|
| 13 |
+
}
|
eval_results/global_step_35/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_35/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 91.3,
|
| 7 |
+
"pass_acc": 91.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 91.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 64.08755540847778,
|
| 12 |
+
"time_use_in_minite": "1:04"
|
| 13 |
+
}
|