Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 40.8,
|
| 7 |
+
"pass_acc": 40.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 40.8
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 64.6,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 57.7,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 32.1,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 27.8,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 27.3,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 61.1,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 36.4
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 56.283315658569336,
|
| 23 |
+
"time_use_in_minite": "0:56"
|
| 24 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 71.3,
|
| 7 |
+
"pass_acc": 71.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 71.3
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 67.0,
|
| 13 |
+
"astronomy": 82.9,
|
| 14 |
+
"college_biology": 85.4,
|
| 15 |
+
"college_chemistry": 53.0,
|
| 16 |
+
"college_computer_science": 73.0,
|
| 17 |
+
"college_mathematics": 62.0,
|
| 18 |
+
"college_physics": 66.7,
|
| 19 |
+
"computer_security": 70.0,
|
| 20 |
+
"conceptual_physics": 84.3,
|
| 21 |
+
"electrical_engineering": 79.3,
|
| 22 |
+
"elementary_mathematics": 63.2,
|
| 23 |
+
"high_school_biology": 87.4,
|
| 24 |
+
"high_school_chemistry": 78.3,
|
| 25 |
+
"high_school_computer_science": 81.0,
|
| 26 |
+
"high_school_mathematics": 40.4,
|
| 27 |
+
"high_school_physics": 67.5,
|
| 28 |
+
"high_school_statistics": 71.8,
|
| 29 |
+
"machine_learning": 72.3
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 601.5261194705963,
|
| 32 |
+
"time_use_in_minite": "10:01"
|
| 33 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 2,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 46.8,
|
| 7 |
+
"pass_acc": 46.8,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 46.8
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 845.5335533618927,
|
| 12 |
+
"time_use_in_minite": "14:05"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 23.3,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 23.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 158.98255920410156,
|
| 12 |
+
"time_use_in_minite": "2:38"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 20.0,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 20.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 237.56575345993042,
|
| 12 |
+
"time_use_in_minite": "3:57"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 75.0,
|
| 7 |
+
"pass_acc": 75.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 75.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 230.7889039516449,
|
| 12 |
+
"time_use_in_minite": "3:50"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 94.2,
|
| 7 |
+
"pass_acc": 94.2,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 94.2
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 94.08725070953369,
|
| 12 |
+
"time_use_in_minite": "1:34"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 2,
|
| 6 |
+
"acc": 82.6,
|
| 7 |
+
"pass_acc": 82.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 82.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 339.8502240180969,
|
| 12 |
+
"time_use_in_minite": "5:39"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 3,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 44.9,
|
| 7 |
+
"pass_acc": 44.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 44.9
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 66.7,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 65.4,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 60.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 66.7,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 39.6,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 24.7,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 45.5,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 77.8,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 36.4
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 45.92661666870117,
|
| 23 |
+
"time_use_in_minite": "0:45"
|
| 24 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 86.1,
|
| 7 |
+
"pass_acc": 86.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 86.1
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 79.0,
|
| 13 |
+
"astronomy": 89.5,
|
| 14 |
+
"college_biology": 90.3,
|
| 15 |
+
"college_chemistry": 61.0,
|
| 16 |
+
"college_computer_science": 76.0,
|
| 17 |
+
"college_mathematics": 84.0,
|
| 18 |
+
"college_physics": 87.3,
|
| 19 |
+
"computer_security": 68.0,
|
| 20 |
+
"conceptual_physics": 86.4,
|
| 21 |
+
"electrical_engineering": 79.3,
|
| 22 |
+
"elementary_mathematics": 96.0,
|
| 23 |
+
"high_school_biology": 92.6,
|
| 24 |
+
"high_school_chemistry": 87.2,
|
| 25 |
+
"high_school_computer_science": 89.0,
|
| 26 |
+
"high_school_mathematics": 90.7,
|
| 27 |
+
"high_school_physics": 85.4,
|
| 28 |
+
"high_school_statistics": 83.3,
|
| 29 |
+
"machine_learning": 78.6
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 454.6443088054657,
|
| 32 |
+
"time_use_in_minite": "7:34"
|
| 33 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 50.7,
|
| 7 |
+
"pass_acc": 50.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 50.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 702.8333060741425,
|
| 12 |
+
"time_use_in_minite": "11:42"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.7,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 261.83673620224,
|
| 12 |
+
"time_use_in_minite": "4:21"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 255.87846636772156,
|
| 12 |
+
"time_use_in_minite": "4:15"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 72.5,
|
| 7 |
+
"pass_acc": 72.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 72.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 195.93005347251892,
|
| 12 |
+
"time_use_in_minite": "3:15"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 93.3,
|
| 7 |
+
"pass_acc": 93.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 93.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 291.3635981082916,
|
| 12 |
+
"time_use_in_minite": "4:51"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 83.4,
|
| 7 |
+
"pass_acc": 83.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 83.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 398.33264088630676,
|
| 12 |
+
"time_use_in_minite": "6:38"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 39.3,
|
| 7 |
+
"pass_acc": 39.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 39.3
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 68.8,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 46.2,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 66.7,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 26.4,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 25.8,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 27.3,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 66.7,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 36.4
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 50.85315251350403,
|
| 23 |
+
"time_use_in_minite": "0:50"
|
| 24 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 5,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 80.6,
|
| 7 |
+
"pass_acc": 80.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 80.6
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 70.0,
|
| 13 |
+
"astronomy": 87.5,
|
| 14 |
+
"college_biology": 88.9,
|
| 15 |
+
"college_chemistry": 67.0,
|
| 16 |
+
"college_computer_science": 78.0,
|
| 17 |
+
"college_mathematics": 75.0,
|
| 18 |
+
"college_physics": 83.3,
|
| 19 |
+
"computer_security": 71.0,
|
| 20 |
+
"conceptual_physics": 83.4,
|
| 21 |
+
"electrical_engineering": 73.1,
|
| 22 |
+
"elementary_mathematics": 85.7,
|
| 23 |
+
"high_school_biology": 89.7,
|
| 24 |
+
"high_school_chemistry": 82.3,
|
| 25 |
+
"high_school_computer_science": 89.0,
|
| 26 |
+
"high_school_mathematics": 72.6,
|
| 27 |
+
"high_school_physics": 82.8,
|
| 28 |
+
"high_school_statistics": 75.9,
|
| 29 |
+
"machine_learning": 70.5
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 475.9500641822815,
|
| 32 |
+
"time_use_in_minite": "7:55"
|
| 33 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 47.0,
|
| 7 |
+
"pass_acc": 47.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 47.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 1089.8832597732544,
|
| 12 |
+
"time_use_in_minite": "18:09"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.7,
|
| 7 |
+
"pass_acc": 26.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 267.1386876106262,
|
| 12 |
+
"time_use_in_minite": "4:27"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 23.3,
|
| 7 |
+
"pass_acc": 23.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 23.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 231.50813817977905,
|
| 12 |
+
"time_use_in_minite": "3:51"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 67.5,
|
| 7 |
+
"pass_acc": 67.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 67.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 236.18571519851685,
|
| 12 |
+
"time_use_in_minite": "3:56"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 92.6,
|
| 7 |
+
"pass_acc": 92.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 92.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 101.05419993400574,
|
| 12 |
+
"time_use_in_minite": "1:41"
|
| 13 |
+
}
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reinforce_pp_deepmath_train_sample_6144_context_4k_Qwen3-8B-Base_max_response4096_batch1024_ppomini256_rollout8_vllm/eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 83.6,
|
| 7 |
+
"pass_acc": 83.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 83.6
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 381.99660444259644,
|
| 12 |
+
"time_use_in_minite": "6:21"
|
| 13 |
+
}
|