Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +43 -0
- eval_results/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_110/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_110/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_110/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_110/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_110/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_110/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_110/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_110/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_110/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_120/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_120/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_120/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_120/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_120/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_120/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_120/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_120/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_120/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_120/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_120/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_120/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_120/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_50/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
.gitattributes
CHANGED
|
@@ -38,3 +38,46 @@ global_step_60/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -t
|
|
| 38 |
global_step_70/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 39 |
global_step_90/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 40 |
global_step_20/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
global_step_70/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 39 |
global_step_90/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 40 |
global_step_20/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
global_step_30/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
eval_results_avg32/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
eval_results_avg32/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
eval_results_avg32/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
eval_results_avg32/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
eval_results_avg32/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
eval_results_avg32/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
eval_results_avg32/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
eval_results_avg32/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
eval_results_avg32/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
eval_results_avg32/plots/eval_results_avg32_avg_stop_tokens.png filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
eval_results_avg32/plots/eval_results_avg32_repeat_ratio_and_token_length.png filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
eval_results_avg32/plots/eval_results_avg32_tokens_keywords.png filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
eval_results_avg32/plots/eval_results_avg32_acc_tokens.png filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
eval_results_avg32/plots/eval_results_avg32_clip_ratio.png filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
eval_results_avg32/plots/eval_results_avg32_correct_tokens.png filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
eval_results_avg32/plots/eval_results_avg32_wrong_tokens.png filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
eval_results_avg32/plots/eval_results_avg32_acc_keywords.png filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
eval_results_avg32/plots/eval_results_avg32_box_ratio_and_token_length.png filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
eval_results_avg32/plots/eval_results_avg32_acc_pass_acc.png filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
eval_results_avg32/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
eval_results_avg32/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
eval_results_avg32/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
eval_results_avg32/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
eval_results_avg32/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
eval_results_avg32/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
global_step_100/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
eval_results/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.7,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 286.882128238678,
|
| 12 |
+
"time_use_in_minite": "4:46"
|
| 13 |
+
}
|
eval_results/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 20.0,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 20.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 254.04739665985107,
|
| 12 |
+
"time_use_in_minite": "4:14"
|
| 13 |
+
}
|
eval_results/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 70.0,
|
| 7 |
+
"pass_acc": 70.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 70.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 166.89173412322998,
|
| 12 |
+
"time_use_in_minite": "2:46"
|
| 13 |
+
}
|
eval_results/global_step_110/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_110/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 93.6,
|
| 7 |
+
"pass_acc": 93.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 93.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 154.87166595458984,
|
| 12 |
+
"time_use_in_minite": "2:34"
|
| 13 |
+
}
|
eval_results/global_step_110/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_110/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 85.6,
|
| 7 |
+
"pass_acc": 85.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 85.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 467.85400772094727,
|
| 12 |
+
"time_use_in_minite": "7:47"
|
| 13 |
+
}
|
eval_results/global_step_110/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_110/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 39.7,
|
| 7 |
+
"pass_acc": 39.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 39.7
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 62.5,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 53.8,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 100.0,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 28.3,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 22.7,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 36.4,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 72.2,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 45.5
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 129.5655951499939,
|
| 23 |
+
"time_use_in_minite": "2:09"
|
| 24 |
+
}
|
eval_results/global_step_110/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 78.4,
|
| 7 |
+
"pass_acc": 78.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 78.4
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 75.0,
|
| 13 |
+
"astronomy": 82.2,
|
| 14 |
+
"college_biology": 86.8,
|
| 15 |
+
"college_chemistry": 64.0,
|
| 16 |
+
"college_computer_science": 80.0,
|
| 17 |
+
"college_mathematics": 75.0,
|
| 18 |
+
"college_physics": 72.5,
|
| 19 |
+
"computer_security": 61.0,
|
| 20 |
+
"conceptual_physics": 81.3,
|
| 21 |
+
"electrical_engineering": 70.3,
|
| 22 |
+
"elementary_mathematics": 84.4,
|
| 23 |
+
"high_school_biology": 88.4,
|
| 24 |
+
"high_school_chemistry": 81.8,
|
| 25 |
+
"high_school_computer_science": 75.0,
|
| 26 |
+
"high_school_mathematics": 75.9,
|
| 27 |
+
"high_school_physics": 70.2,
|
| 28 |
+
"high_school_statistics": 79.2,
|
| 29 |
+
"machine_learning": 70.5
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 922.6077558994293,
|
| 32 |
+
"time_use_in_minite": "15:22"
|
| 33 |
+
}
|
eval_results/global_step_110/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_110/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 52.0,
|
| 7 |
+
"pass_acc": 52.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 52.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 1118.1542921066284,
|
| 12 |
+
"time_use_in_minite": "18:38"
|
| 13 |
+
}
|
eval_results/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 20.0,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 20.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 279.13312292099,
|
| 12 |
+
"time_use_in_minite": "4:39"
|
| 13 |
+
}
|
eval_results/global_step_120/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_120/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 231.71165895462036,
|
| 12 |
+
"time_use_in_minite": "3:51"
|
| 13 |
+
}
|
eval_results/global_step_120/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_120/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 67.5,
|
| 7 |
+
"pass_acc": 67.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 67.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 233.46848917007446,
|
| 12 |
+
"time_use_in_minite": "3:53"
|
| 13 |
+
}
|
eval_results/global_step_120/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_120/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 93.9,
|
| 7 |
+
"pass_acc": 93.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 93.9
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 361.95891976356506,
|
| 12 |
+
"time_use_in_minite": "6:01"
|
| 13 |
+
}
|
eval_results/global_step_120/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_120/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 86.0,
|
| 7 |
+
"pass_acc": 86.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 86.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 490.3726477622986,
|
| 12 |
+
"time_use_in_minite": "8:10"
|
| 13 |
+
}
|
eval_results/global_step_120/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_120/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 43.0,
|
| 7 |
+
"pass_acc": 43.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 43.0
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 66.7,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 61.5,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 66.7,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 32.1,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 27.8,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 27.3,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 66.7,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 54.5
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 290.67028999328613,
|
| 23 |
+
"time_use_in_minite": "4:50"
|
| 24 |
+
}
|
eval_results/global_step_120/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 77.5,
|
| 7 |
+
"pass_acc": 77.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 77.5
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 73.0,
|
| 13 |
+
"astronomy": 80.9,
|
| 14 |
+
"college_biology": 88.2,
|
| 15 |
+
"college_chemistry": 65.0,
|
| 16 |
+
"college_computer_science": 70.0,
|
| 17 |
+
"college_mathematics": 70.0,
|
| 18 |
+
"college_physics": 69.6,
|
| 19 |
+
"computer_security": 59.0,
|
| 20 |
+
"conceptual_physics": 83.0,
|
| 21 |
+
"electrical_engineering": 69.7,
|
| 22 |
+
"elementary_mathematics": 85.7,
|
| 23 |
+
"high_school_biology": 88.7,
|
| 24 |
+
"high_school_chemistry": 77.3,
|
| 25 |
+
"high_school_computer_science": 73.0,
|
| 26 |
+
"high_school_mathematics": 73.3,
|
| 27 |
+
"high_school_physics": 74.2,
|
| 28 |
+
"high_school_statistics": 77.8,
|
| 29 |
+
"machine_learning": 68.8
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 914.7490997314453,
|
| 32 |
+
"time_use_in_minite": "15:14"
|
| 33 |
+
}
|
eval_results/global_step_120/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_120/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 52.9,
|
| 7 |
+
"pass_acc": 52.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 52.9
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 1139.7145493030548,
|
| 12 |
+
"time_use_in_minite": "18:59"
|
| 13 |
+
}
|
eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 23.3,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 23.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 332.4983825683594,
|
| 12 |
+
"time_use_in_minite": "5:32"
|
| 13 |
+
}
|
eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 20.0,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 20.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 301.8720078468323,
|
| 12 |
+
"time_use_in_minite": "5:01"
|
| 13 |
+
}
|
eval_results/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 93.6,
|
| 7 |
+
"pass_acc": 93.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 93.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 149.86318922042847,
|
| 12 |
+
"time_use_in_minite": "2:29"
|
| 13 |
+
}
|
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 84.6,
|
| 7 |
+
"pass_acc": 84.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 84.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 586.838475227356,
|
| 12 |
+
"time_use_in_minite": "9:46"
|
| 13 |
+
}
|
eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 79.5,
|
| 7 |
+
"pass_acc": 79.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 79.5
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 75.0,
|
| 13 |
+
"astronomy": 82.9,
|
| 14 |
+
"college_biology": 87.5,
|
| 15 |
+
"college_chemistry": 62.0,
|
| 16 |
+
"college_computer_science": 66.0,
|
| 17 |
+
"college_mathematics": 80.0,
|
| 18 |
+
"college_physics": 76.5,
|
| 19 |
+
"computer_security": 65.0,
|
| 20 |
+
"conceptual_physics": 81.3,
|
| 21 |
+
"electrical_engineering": 73.1,
|
| 22 |
+
"elementary_mathematics": 85.2,
|
| 23 |
+
"high_school_biology": 91.0,
|
| 24 |
+
"high_school_chemistry": 78.3,
|
| 25 |
+
"high_school_computer_science": 77.0,
|
| 26 |
+
"high_school_mathematics": 81.9,
|
| 27 |
+
"high_school_physics": 77.5,
|
| 28 |
+
"high_school_statistics": 77.8,
|
| 29 |
+
"machine_learning": 68.8
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 902.6363174915314,
|
| 32 |
+
"time_use_in_minite": "15:02"
|
| 33 |
+
}
|
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 48.1,
|
| 7 |
+
"pass_acc": 48.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 48.1
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 1647.6752095222473,
|
| 12 |
+
"time_use_in_minite": "27:27"
|
| 13 |
+
}
|
eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 20.0,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 20.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 297.49846267700195,
|
| 12 |
+
"time_use_in_minite": "4:57"
|
| 13 |
+
}
|
eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 23.3,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 23.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 271.2436902523041,
|
| 12 |
+
"time_use_in_minite": "4:31"
|
| 13 |
+
}
|
eval_results/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 75.0,
|
| 7 |
+
"pass_acc": 75.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 75.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 247.3225588798523,
|
| 12 |
+
"time_use_in_minite": "4:07"
|
| 13 |
+
}
|
eval_results/global_step_50/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|