Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_60/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_60/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_60/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_60/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 4,
|
| 6 |
+
"acc": 85.6,
|
| 7 |
+
"pass_acc": 85.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 85.6
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 85.0,
|
| 13 |
+
"astronomy": 88.8,
|
| 14 |
+
"college_biology": 89.6,
|
| 15 |
+
"college_chemistry": 68.0,
|
| 16 |
+
"college_computer_science": 75.0,
|
| 17 |
+
"college_mathematics": 79.0,
|
| 18 |
+
"college_physics": 86.3,
|
| 19 |
+
"computer_security": 71.0,
|
| 20 |
+
"conceptual_physics": 86.8,
|
| 21 |
+
"electrical_engineering": 76.6,
|
| 22 |
+
"elementary_mathematics": 95.2,
|
| 23 |
+
"high_school_biology": 91.6,
|
| 24 |
+
"high_school_chemistry": 85.2,
|
| 25 |
+
"high_school_computer_science": 86.0,
|
| 26 |
+
"high_school_mathematics": 90.7,
|
| 27 |
+
"high_school_physics": 84.8,
|
| 28 |
+
"high_school_statistics": 81.9,
|
| 29 |
+
"machine_learning": 76.8
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 453.47122716903687,
|
| 32 |
+
"time_use_in_minite": "7:33"
|
| 33 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 49.2,
|
| 7 |
+
"pass_acc": 49.2,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 49.2
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 739.9170608520508,
|
| 12 |
+
"time_use_in_minite": "12:19"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 30.0,
|
| 7 |
+
"pass_acc": 30.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 30.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 256.68022561073303,
|
| 12 |
+
"time_use_in_minite": "4:16"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 16.7,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 16.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 244.1304202079773,
|
| 12 |
+
"time_use_in_minite": "4:04"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 65.0,
|
| 7 |
+
"pass_acc": 65.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 65.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 55.264809131622314,
|
| 12 |
+
"time_use_in_minite": "0:55"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 93.5,
|
| 7 |
+
"pass_acc": 93.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 93.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 102.33035564422607,
|
| 12 |
+
"time_use_in_minite": "1:42"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 84.0,
|
| 7 |
+
"pass_acc": 84.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 84.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 222.14642930030823,
|
| 12 |
+
"time_use_in_minite": "3:42"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 43.4,
|
| 7 |
+
"pass_acc": 43.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 43.4
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 72.9,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 50.0,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 60.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 66.7,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 34.0,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 26.8,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 27.3,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 72.2,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 45.5
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 236.5802080631256,
|
| 23 |
+
"time_use_in_minite": "3:56"
|
| 24 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 3,
|
| 6 |
+
"acc": 85.6,
|
| 7 |
+
"pass_acc": 85.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 85.6
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 82.0,
|
| 13 |
+
"astronomy": 88.8,
|
| 14 |
+
"college_biology": 86.8,
|
| 15 |
+
"college_chemistry": 67.0,
|
| 16 |
+
"college_computer_science": 80.0,
|
| 17 |
+
"college_mathematics": 79.0,
|
| 18 |
+
"college_physics": 91.2,
|
| 19 |
+
"computer_security": 67.0,
|
| 20 |
+
"conceptual_physics": 85.1,
|
| 21 |
+
"electrical_engineering": 77.2,
|
| 22 |
+
"elementary_mathematics": 95.5,
|
| 23 |
+
"high_school_biology": 90.3,
|
| 24 |
+
"high_school_chemistry": 87.7,
|
| 25 |
+
"high_school_computer_science": 92.0,
|
| 26 |
+
"high_school_mathematics": 88.9,
|
| 27 |
+
"high_school_physics": 83.4,
|
| 28 |
+
"high_school_statistics": 83.3,
|
| 29 |
+
"machine_learning": 76.8
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 446.51267433166504,
|
| 32 |
+
"time_use_in_minite": "7:26"
|
| 33 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 47.6,
|
| 7 |
+
"pass_acc": 47.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 47.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 731.3076951503754,
|
| 12 |
+
"time_use_in_minite": "12:11"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 30.0,
|
| 7 |
+
"pass_acc": 30.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 30.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 248.597638130188,
|
| 12 |
+
"time_use_in_minite": "4:08"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 16.7,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 16.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 104.9330325126648,
|
| 12 |
+
"time_use_in_minite": "1:44"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 70.0,
|
| 7 |
+
"pass_acc": 70.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 70.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 135.92350673675537,
|
| 12 |
+
"time_use_in_minite": "2:15"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 94.2,
|
| 7 |
+
"pass_acc": 94.2,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 94.2
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 92.96837067604065,
|
| 12 |
+
"time_use_in_minite": "1:32"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 85.6,
|
| 7 |
+
"pass_acc": 85.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 85.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 355.6146864891052,
|
| 12 |
+
"time_use_in_minite": "5:55"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 44.1,
|
| 7 |
+
"pass_acc": 44.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 44.1
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 66.7,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 53.8,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 60.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 66.7,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 37.7,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 27.8,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 27.3,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 77.8,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 45.5
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 127.21856451034546,
|
| 23 |
+
"time_use_in_minite": "2:07"
|
| 24 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 85.4,
|
| 7 |
+
"pass_acc": 85.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 85.4
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 85.0,
|
| 13 |
+
"astronomy": 88.8,
|
| 14 |
+
"college_biology": 88.9,
|
| 15 |
+
"college_chemistry": 67.0,
|
| 16 |
+
"college_computer_science": 77.0,
|
| 17 |
+
"college_mathematics": 77.0,
|
| 18 |
+
"college_physics": 85.3,
|
| 19 |
+
"computer_security": 69.0,
|
| 20 |
+
"conceptual_physics": 86.0,
|
| 21 |
+
"electrical_engineering": 77.2,
|
| 22 |
+
"elementary_mathematics": 94.7,
|
| 23 |
+
"high_school_biology": 91.9,
|
| 24 |
+
"high_school_chemistry": 86.2,
|
| 25 |
+
"high_school_computer_science": 87.0,
|
| 26 |
+
"high_school_mathematics": 90.0,
|
| 27 |
+
"high_school_physics": 83.4,
|
| 28 |
+
"high_school_statistics": 82.4,
|
| 29 |
+
"machine_learning": 75.9
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 446.0400495529175,
|
| 32 |
+
"time_use_in_minite": "7:26"
|
| 33 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_80/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 49.9,
|
| 7 |
+
"pass_acc": 49.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 49.9
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 766.2501382827759,
|
| 12 |
+
"time_use_in_minite": "12:46"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.7,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 244.0300760269165,
|
| 12 |
+
"time_use_in_minite": "4:04"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 253.21099305152893,
|
| 12 |
+
"time_use_in_minite": "4:13"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 70.0,
|
| 7 |
+
"pass_acc": 70.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 70.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 145.44669604301453,
|
| 12 |
+
"time_use_in_minite": "2:25"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 93.9,
|
| 7 |
+
"pass_acc": 93.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 93.9
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 92.97574210166931,
|
| 12 |
+
"time_use_in_minite": "1:32"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 85.8,
|
| 7 |
+
"pass_acc": 85.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 85.8
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 297.74292850494385,
|
| 12 |
+
"time_use_in_minite": "4:57"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 42.6,
|
| 7 |
+
"pass_acc": 42.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 42.6
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 66.7,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 57.7,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 60.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 66.7,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 28.3,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 27.8,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 18.2,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 77.8,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 54.5
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 58.997036933898926,
|
| 23 |
+
"time_use_in_minite": "0:58"
|
| 24 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_90/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 86.1,
|
| 7 |
+
"pass_acc": 86.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 86.1
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 87.0,
|
| 13 |
+
"astronomy": 89.5,
|
| 14 |
+
"college_biology": 88.2,
|
| 15 |
+
"college_chemistry": 66.0,
|
| 16 |
+
"college_computer_science": 76.0,
|
| 17 |
+
"college_mathematics": 75.0,
|
| 18 |
+
"college_physics": 88.2,
|
| 19 |
+
"computer_security": 65.0,
|
| 20 |
+
"conceptual_physics": 88.9,
|
| 21 |
+
"electrical_engineering": 77.9,
|
| 22 |
+
"elementary_mathematics": 96.6,
|
| 23 |
+
"high_school_biology": 92.3,
|
| 24 |
+
"high_school_chemistry": 87.7,
|
| 25 |
+
"high_school_computer_science": 83.0,
|
| 26 |
+
"high_school_mathematics": 91.9,
|
| 27 |
+
"high_school_physics": 83.4,
|
| 28 |
+
"high_school_statistics": 84.3,
|
| 29 |
+
"machine_learning": 78.6
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 508.3278658390045,
|
| 32 |
+
"time_use_in_minite": "8:28"
|
| 33 |
+
}
|