Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- eval_results/eval_results.csv +14 -0
- eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results_ood/global_step_0/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/2025-07-14T12-41-17.800843/details_lighteval|gpqa:diamond|0_2025-07-14T12-41-17.800843.csv +0 -0
- eval_results_ood/global_step_0/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/2025-07-17T15-29-55.092392/details_extended|ifeval|0_2025-07-17T15-29-55.092392.csv +0 -0
- eval_results_ood/global_step_0/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/results_2025-07-14T12-41-17.800843.json +114 -0
- eval_results_ood/global_step_0/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/results_2025-07-17T15-29-55.092392.json +141 -0
- eval_results_ood/global_step_0/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/results_2025-07-17T21-16-09.926343.json +114 -0
- eval_results_ood/global_step_10/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/2025-07-14T12-45-55.307385/details_lighteval|gpqa:diamond|0_2025-07-14T12-45-55.307385.csv +0 -0
- eval_results_ood/global_step_10/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/2025-07-17T15-27-34.486497/details_extended|ifeval|0_2025-07-17T15-27-34.486497.csv +0 -0
- eval_results_ood/global_step_10/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/results_2025-07-14T12-45-55.307385.json +114 -0
- eval_results_ood/global_step_10/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/results_2025-07-17T15-27-34.486497.json +141 -0
- eval_results_ood/global_step_10/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/results_2025-07-17T21-12-43.036251.json +114 -0
- eval_results_ood/global_step_100/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/2025-07-14T12-58-50.192486/details_lighteval|gpqa:diamond|0_2025-07-14T12-58-50.192486.csv +0 -0
- eval_results_ood/global_step_100/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/2025-07-17T15-57-23.838956/details_extended|ifeval|0_2025-07-17T15-57-23.838956.csv +0 -0
- eval_results_ood/global_step_100/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/results_2025-07-14T12-58-50.192486.json +114 -0
- eval_results_ood/global_step_100/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/results_2025-07-17T15-57-23.838956.json +141 -0
- eval_results_ood/global_step_100/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/results_2025-07-17T22-59-45.311675.json +114 -0
- eval_results_ood/global_step_110/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/2025-07-14T13-00-52.252218/details_lighteval|gpqa:diamond|0_2025-07-14T13-00-52.252218.csv +0 -0
- eval_results_ood/global_step_110/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/2025-07-17T15-52-51.610154/details_extended|ifeval|0_2025-07-17T15-52-51.610154.csv +0 -0
- eval_results_ood/global_step_110/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/results_2025-07-14T13-00-52.252218.json +114 -0
- eval_results_ood/global_step_110/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/results_2025-07-17T15-52-51.610154.json +141 -0
- eval_results_ood/global_step_110/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/results_2025-07-17T22-57-39.607069.json +114 -0
- eval_results_ood/global_step_120/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/2025-07-14T13-13-03.024786/details_lighteval|gpqa:diamond|0_2025-07-14T13-13-03.024786.csv +0 -0
- eval_results_ood/global_step_120/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/2025-07-17T16-25-43.731519/details_extended|ifeval|0_2025-07-17T16-25-43.731519.csv +0 -0
- eval_results_ood/global_step_120/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/results_2025-07-14T13-13-03.024786.json +114 -0
- eval_results_ood/global_step_120/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/results_2025-07-17T16-25-43.731519.json +141 -0
- eval_results_ood/global_step_120/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/results_2025-07-17T23-21-18.939034.json +114 -0
- eval_results_ood/global_step_20/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/2025-07-14T12-46-08.302506/details_lighteval|gpqa:diamond|0_2025-07-14T12-46-08.302506.csv +0 -0
- eval_results_ood/global_step_20/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/2025-07-17T15-33-20.611749/details_extended|ifeval|0_2025-07-17T15-33-20.611749.csv +0 -0
- eval_results_ood/global_step_20/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/results_2025-07-14T12-46-08.302506.json +114 -0
- eval_results_ood/global_step_20/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/results_2025-07-17T15-33-20.611749.json +141 -0
- eval_results_ood/global_step_20/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/results_2025-07-17T21-33-30.966895.json +114 -0
- eval_results_ood/global_step_30/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/2025-07-14T12-41-43.868511/details_lighteval|gpqa:diamond|0_2025-07-14T12-41-43.868511.csv +0 -0
- eval_results_ood/global_step_30/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/2025-07-17T15-22-22.924052/details_extended|ifeval|0_2025-07-17T15-22-22.924052.csv +0 -0
- eval_results_ood/global_step_30/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/results_2025-07-14T12-41-43.868511.json +114 -0
- eval_results_ood/global_step_30/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/results_2025-07-17T15-22-22.924052.json +141 -0
- eval_results_ood/global_step_30/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/results_2025-07-17T21-33-36.117826.json +114 -0
- eval_results_ood/global_step_40/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/2025-07-14T12-49-47.596528/details_lighteval|gpqa:diamond|0_2025-07-14T12-49-47.596528.csv +0 -0
- eval_results_ood/global_step_40/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/2025-07-17T15-23-49.905332/details_extended|ifeval|0_2025-07-17T15-23-49.905332.csv +0 -0
- eval_results_ood/global_step_40/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/results_2025-07-14T12-49-47.596528.json +114 -0
- eval_results_ood/global_step_40/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/results_2025-07-17T15-23-49.905332.json +141 -0
- eval_results_ood/global_step_40/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/results_2025-07-17T21-52-17.853514.json +114 -0
- eval_results_ood/global_step_50/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/2025-07-14T12-55-00.091779/details_lighteval|gpqa:diamond|0_2025-07-14T12-55-00.091779.csv +0 -0
- eval_results_ood/global_step_50/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/2025-07-17T15-24-59.219716/details_extended|ifeval|0_2025-07-17T15-24-59.219716.csv +0 -0
- eval_results_ood/global_step_50/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/results_2025-07-14T12-55-00.091779.json +114 -0
- eval_results_ood/global_step_50/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/results_2025-07-17T15-24-59.219716.json +141 -0
- eval_results_ood/global_step_50/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/results_2025-07-17T22-00-10.015386.json +114 -0
- eval_results_ood/global_step_60/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_60_actor_huggingface/2025-07-14T12-55-24.416815/details_lighteval|gpqa:diamond|0_2025-07-14T12-55-24.416815.csv +0 -0
eval_results/eval_results.csv
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model,minerva_math_acc,minerva_math_pass_acc,minerva_math_tokens,minerva_math_keywords,minerva_math_correct_tokens,minerva_math_wrong_tokens,minerva_math_clip_ratio,minerva_math_stop_tokens,minerva_math_stop_ratio,minerva_math_box_ratio,minerva_math_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,mmlu_stem_acc,mmlu_stem_pass_acc,mmlu_stem_tokens,mmlu_stem_keywords,mmlu_stem_correct_tokens,mmlu_stem_wrong_tokens,mmlu_stem_clip_ratio,mmlu_stem_stop_tokens,mmlu_stem_stop_ratio,mmlu_stem_box_ratio,mmlu_stem_repeat_ratio,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,gsm8k_acc,gsm8k_pass_acc,gsm8k_tokens,gsm8k_keywords,gsm8k_correct_tokens,gsm8k_wrong_tokens,gsm8k_clip_ratio,gsm8k_stop_tokens,gsm8k_stop_ratio,gsm8k_box_ratio,gsm8k_repeat_ratio,math500_acc,math500_pass_acc,math500_tokens,math500_keywords,math500_correct_tokens,math500_wrong_tokens,math500_clip_ratio,math500_stop_tokens,math500_stop_ratio,math500_box_ratio,math500_repeat_ratio,olympiadbench_acc,olympiadbench_pass_acc,olympiadbench_tokens,olympiadbench_keywords,olympiadbench_correct_tokens,olympiadbench_wrong_tokens,olympiadbench_clip_ratio,olympiadbench_stop_tokens,olympiadbench_stop_ratio,olympiadbench_box_ratio,olympiadbench_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
|
| 2 |
+
eval_results-global_step_0,26.1,26.1,724.5992647058823,0.11764705882352941,988.7887323943662,631.2786069651742,0.007352941176470588,608.4481481481481,0.9926470588235294,0.9375,0.3897058823529412,45.0,45.0,990.275,0.675,987.2222222222222,992.7727272727273,0.0,990.275,1.0,1.0,0.775,60.7,60.7,373.0622929092114,0.26673293571901924,319.14020731042007,456.47088607594935,0.004638833664678595,301.13848202396804,0.9953611663353215,0.9055666003976143,0.4592445328031809,6.7,6.7,2314.4333333333334,7.666666666666667,1565.5,2367.9285714285716,0.06666666666666667,1350.892857142857,0.9333333333333333,0.9333333333333333,0.9,13.3,13.3,1927.8333333333333,1.8,5070.0,1444.423076923077,0.03333333333333333,1442.655172413793,0.9666666666666667,0.9333333333333333,0.8,84.3,84.3,336.931766489765,0.04927975739196361,304.51079136690646,511.0966183574879,0.004548900682335102,266.7989337395278,0.9954510993176648,0.9575435936315391,0.17816527672479152,65.4,65.4,879.99,0.92,530.434250764526,1540.7109826589594,0.018,602.5336048879838,0.982,0.964,0.448,31.1,31.1,1166.722962962963,1.2696296296296297,866.952380952381,1302.1032258064515,0.011851851851851851,983.7991004497751,0.9881481481481481,0.9674074074074074,0.6266666666666667,41.575,41.575,1089.230994216811,1.5956195060288512,1329.068573126353,1155.8480869360496,0.018299065921917018,818.3176623507566,0.9817009340780829,0.9498355335129034,0.5720977948184475
|
| 3 |
+
eval_results-global_step_10,32.7,32.7,753.8308823529412,0.07352941176470588,513.1685393258427,870.8743169398907,0.007352941176470588,638.6592592592592,0.9926470588235294,0.9926470588235294,0.45955882352941174,52.5,52.5,1412.525,1.225,924.047619047619,1952.421052631579,0.025,1038.5897435897436,0.975,0.975,0.625,64.5,64.5,459.8711066931743,0.6196156394963552,407.25513347022587,555.6616822429907,0.0026507620941020544,418.36146179401993,0.9973492379058979,0.9758117958913187,0.558648111332008,16.7,16.7,3543.4666666666667,5.866666666666666,1584.2,3935.32,0.03333333333333333,3113.9655172413795,0.9666666666666667,0.9333333333333333,0.9,16.7,16.7,2644.3,8.2,1336.0,2905.96,0.03333333333333333,2183.7586206896553,0.9666666666666667,0.9333333333333333,0.7333333333333333,91.8,91.8,312.6459438968916,0.03335860500379075,292.8117258464079,535.0462962962963,0.000758150113722517,300.7443095599393,0.9992418498862775,0.9984836997725549,0.1599696739954511,79.2,79.2,966.608,0.984,678.979797979798,2061.8076923076924,0.006,875.8068410462777,0.994,0.988,0.514,43.6,43.6,1474.634074074074,1.288888888888889,1011.6360544217687,1831.9081364829397,0.016296296296296295,1233.936746987952,0.9837037037037037,0.9733333333333334,0.6696296296296296,49.7125,49.7125,1445.9852092104686,2.286382401477551,843.5123587614578,1831.1248971126734,0.015590602043407264,1225.4778125210285,0.9844093979565927,0.9712428193109254,0.5775174464774793
|
| 4 |
+
eval_results-global_step_20,36.8,36.8,749.3419117647059,0.2647058823529412,544.13,868.6511627906976,0.0,749.3419117647059,1.0,0.9963235294117647,0.45588235294117646,62.5,62.5,3341.225,11.925,2057.12,5481.4,0.025,3016.641025641026,0.975,0.925,0.85,72.1,72.1,622.5596421471173,0.936050364479788,561.6977491961414,780.1058263971463,0.0023194168323392977,586.8462304882099,0.9976805831676607,0.9804506295559974,0.6292246520874751,13.3,13.3,6496.2,11.333333333333334,1646.75,7242.2692307692305,0.2,4119.416666666667,0.8,0.7333333333333333,0.9666666666666667,23.3,23.3,4939.533333333334,30.4,3971.285714285714,5234.217391304348,0.03333333333333333,4558.068965517241,0.9666666666666667,0.8333333333333334,0.9333333333333333,93.5,93.5,381.9128127369219,0.0712661106899166,376.021897810219,466.3720930232558,0.0,381.9128127369219,1.0,0.9984836997725549,0.19787717968157695,81.4,81.4,1488.374,2.486,922.936117936118,3962.9247311827958,0.008,1370.9415322580646,0.992,0.98,0.586,48.3,48.3,2765.731851851852,6.060740740740741,1574.4110429447853,3878.541547277937,0.034074074074074076,2299.0153374233128,0.965925925925926,0.9525925925925925,0.8340740740740741,53.9,53.9,2598.109818979241,7.93463705394959,1456.7940652716222,3489.3102478431765,0.03784085302996834,2135.2730603120185,0.9621591469700317,0.924939639749947,0.681632282348038
|
| 5 |
+
eval_results-global_step_30,36.8,36.8,1043.0625,3.073529411764706,695.07,1245.3837209302326,0.007352941176470588,930.9962962962964,0.9926470588235294,0.9926470588235294,0.5661764705882353,70.0,70.0,3201.2,8.175,2237.4285714285716,5450.0,0.025,2873.25641025641,0.975,0.95,0.8,78.1,78.1,765.3389662027834,1.4532803180914513,657.6355536699194,1149.3888048411497,0.003644797879390325,709.4845360824743,0.9963552021206097,0.9804506295559974,0.6726308813783963,16.7,16.7,7676.766666666666,21.766666666666666,1997.4,8812.64,0.16666666666666666,6022.24,0.8333333333333334,0.7,0.9666666666666667,26.7,26.7,5636.333333333333,19.233333333333334,3237.25,6508.727272727273,0.06666666666666667,4902.642857142857,0.9333333333333333,0.8333333333333334,0.9666666666666667,91.7,91.7,464.3199393479909,0.18119787717968158,439.6575682382134,735.3818181818182,0.0,464.3199393479909,1.0,0.9969673995451099,0.26459438968915844,83.4,83.4,1690.934,2.582,1125.4604316546763,4531.9277108433735,0.01,1546.3030303030303,0.99,0.99,0.688,48.9,48.9,3462.4933333333333,10.435555555555556,1835.0363636363636,5019.191304347826,0.056296296296296296,2714.12715855573,0.9437037037037037,0.914074074074074,0.8622222222222222,56.537499999999994,56.537499999999994,2992.5560923605135,8.362570395323925,1528.117311078468,4181.580078983959,0.04195342108568632,2520.4212784980987,0.9580465789143137,0.9196840619165055,0.7233696621514182
|
| 6 |
+
eval_results-global_step_40,36.4,36.4,975.8639705882352,0.6286764705882353,745.4646464646464,1107.7109826589594,0.0,975.8639705882352,1.0,0.9963235294117647,0.5441176470588235,67.5,67.5,3282.125,8.0,1868.5555555555557,6218.0,0.025,2955.948717948718,0.975,0.95,0.8,79.5,79.5,786.0828363154407,1.2647448641484427,724.7739783152628,1023.2096774193549,0.0023194168323392977,750.388575224178,0.9976805831676607,0.9797879390324719,0.7024519549370444,23.3,23.3,7061.666666666667,25.966666666666665,2573.1428571428573,8427.739130434782,0.2,4827.333333333333,0.8,0.7333333333333333,1.0,20.0,20.0,6232.8,24.766666666666666,2844.5,7079.875,0.16666666666666666,4287.32,0.8333333333333334,0.8,0.9666666666666667,93.6,93.6,458.33965125094767,0.14935557240333586,449.0680713128039,592.9411764705883,0.0,458.33965125094767,1.0,0.9984836997725549,0.2918877937831691,84.6,84.6,1762.144,3.422,1090.9810874704492,5449.181818181818,0.012,1588.7105263157894,0.988,0.978,0.678,48.1,48.1,3415.6711111111113,8.457777777777778,1656.1076923076923,5049.551428571429,0.054814814814814816,2685.896551724138,0.9451851851851852,0.9214814814814815,0.8755555555555555,56.625,56.625,2996.83665449155,9.08198600228139,1494.0742360711586,4368.526151717117,0.0576001122892276,2316.2251657981674,0.9423998877107724,0.9196762478789507,0.7323349522501574
|
| 7 |
+
eval_results-global_step_50,36.8,36.8,1181.3419117647059,3.0955882352941178,845.37,1376.6744186046512,0.003676470588235294,1126.719557195572,0.9963235294117647,0.9816176470588235,0.5955882352941176,75.0,75.0,3588.575,9.35,1828.4333333333334,8869.0,0.0,3588.575,1.0,0.875,0.825,80.0,80.0,814.1610337972166,1.739562624254473,745.4556752278376,1088.7549668874171,0.0033134526176275677,763.7563164893617,0.9966865473823724,0.9774685222001326,0.7203445990722332,20.0,20.0,6118.866666666667,15.466666666666667,3677.6666666666665,6729.166666666667,0.13333333333333333,4597.615384615385,0.8666666666666667,0.8,0.9666666666666667,23.3,23.3,5266.633333333333,12.366666666666667,3300.0,5865.173913043478,0.03333333333333333,4897.0,0.9666666666666667,0.9,1.0,93.7,93.7,457.8339651250948,0.11827141774071266,448.87783171521033,591.2048192771084,0.0,457.8339651250948,1.0,0.9992418498862775,0.287338893100834,85.4,85.4,1685.964,3.682,1108.4496487119438,5064.027397260274,0.014,1482.6997971602434,0.986,0.974,0.666,51.0,51.0,3449.077037037037,8.59851851851852,1751.0029069767443,5213.842900302115,0.05333333333333334,2740.622848200313,0.9466666666666667,0.9318518518518518,0.8740740740740741,58.150000000000006,58.150000000000006,2820.3066184655063,6.802159266142645,1713.157007828967,4349.730635255213,0.03012374040073286,2456.8528585982463,0.969876259599267,0.9298974838746358,0.7418765585259908
|
| 8 |
+
eval_results-global_step_60,37.5,37.5,1045.2610294117646,0.6875,805.2352941176471,1189.2764705882353,0.0,1045.2610294117646,1.0,0.9963235294117647,0.6139705882352942,65.0,65.0,3065.25,5.85,1716.8076923076924,5569.5,0.025,2734.5128205128203,0.975,0.925,0.875,79.3,79.3,802.0414181577204,1.7978793903247183,720.0258980785296,1116.6971153846155,0.0023194168323392977,766.614413816008,0.9976805831676607,0.9797879390324719,0.7190192180251822,30.0,30.0,5753.466666666666,42.0,2277.4444444444443,7243.190476190476,0.1,4612.888888888889,0.9,0.8333333333333334,1.0,16.7,16.7,6363.6,29.966666666666665,1694.4,7297.44,0.1,5290.925925925926,0.9,0.8333333333333334,0.9,93.4,93.4,465.0614101592115,0.26914329037149354,437.96915584415586,848.7126436781609,0.000758150113722517,453.1449165402124,0.9992418498862775,0.9984836997725549,0.310841546626232,82.6,82.6,1628.972,2.358,1079.7772397094432,4236.068965517241,0.008,1513.1532258064517,0.992,0.984,0.68,50.2,50.2,3312.311111111111,9.139259259259259,1817.2625368731563,4820.708333333333,0.034074074074074076,2864.814417177914,0.965925925925926,0.9303703703703704,0.8711111111111111,56.8375,56.8375,2804.495454438309,11.508556075827768,1318.6152826718835,4040.199250586508,0.033768955127516986,2410.164454759998,0.966231044872483,0.9350790256567286,0.7462428079997275
|
| 9 |
+
eval_results-global_step_70,40.8,40.8,1070.0698529411766,0.6875,785.4504504504505,1266.2981366459628,0.0,1070.0698529411766,1.0,0.9926470588235294,0.5845588235294118,72.5,72.5,2952.05,6.55,1982.4137931034484,5508.363636363636,0.025,2617.6153846153848,0.975,0.975,0.85,78.2,78.2,819.4668654738238,1.6358515573227304,735.9610004239084,1118.3899848254932,0.003976143141153081,759.0133067198935,0.9960238568588469,0.9774685222001326,0.7077534791252486,20.0,20.0,5749.833333333333,19.566666666666666,1802.5,6736.666666666667,0.13333333333333333,4170.076923076923,0.8666666666666667,0.8,0.9666666666666667,20.0,20.0,6076.033333333334,26.633333333333333,2108.5,7067.916666666667,0.13333333333333333,4569.2692307692305,0.8666666666666667,0.7666666666666667,1.0,93.4,93.4,442.3563305534496,0.11448066717210008,435.9862012987013,532.5632183908046,0.0,442.3563305534496,1.0,0.9992418498862775,0.2699014404852161,84.6,84.6,1716.186,6.8,1041.2033096926714,5424.207792207792,0.018,1453.529531568228,0.982,0.974,0.662,50.1,50.1,3079.0755555555556,8.274074074074074,1694.792899408284,4467.46587537092,0.035555555555555556,2602.9001536098312,0.9644444444444444,0.9481481481481482,0.8696296296296296,57.45,57.45,2738.133908898834,8.782738287321113,1323.3509567971828,4015.233997142243,0.04364979567042192,2210.603839231765,0.9563502043295781,0.9291465307155944,0.7388137549295217
|
| 10 |
+
eval_results-global_step_80,36.4,36.4,1121.7132352941176,0.7757352941176471,831.2525252525253,1287.9306358381502,0.003676470588235294,1068.830258302583,0.9963235294117647,0.9963235294117647,0.5808823529411765,67.5,67.5,3409.775,6.65,2261.740740740741,5794.153846153846,0.0,3409.775,1.0,0.95,0.775,79.4,79.4,822.2004638833665,1.6693174287607688,724.3124739257405,1200.038647342995,0.003644797879390325,766.9015630196209,0.9963552021206097,0.9777998674618953,0.7239893969516236,30.0,30.0,6118.966666666666,16.4,2107.3333333333335,7838.238095238095,0.16666666666666666,4153.92,0.8333333333333334,0.8,1.0,20.0,20.0,6597.433333333333,38.6,4887.333333333333,7024.958333333333,0.1,5542.666666666667,0.9,0.7666666666666667,0.9333333333333333,92.6,92.6,475.15238817285825,0.16300227445034116,450.36497545008183,787.4226804123712,0.0,475.15238817285825,1.0,0.9984836997725549,0.2767247915087187,89.2,89.2,1553.866,2.588,1125.4035874439462,5092.648148148148,0.004,1495.8875502008032,0.996,0.984,0.684,50.8,50.8,3215.7614814814815,8.687407407407408,1759.9825072886297,4719.774096385542,0.037037037037037035,2723.2046153846154,0.9629629629629629,0.9392592592592592,0.8696296296296296,58.2375,58.2375,2914.3585711039777,9.44168280059202,1768.4654345960412,4218.14556035656,0.03937812152141617,2454.5422552183936,0.9606218784785839,0.9265666278215176,0.7304449380455603
|
| 11 |
+
eval_results-global_step_90,40.8,40.8,1174.1066176470588,1.411764705882353,899.7207207207207,1363.27950310559,0.0,1174.1066176470588,1.0,0.9852941176470589,0.6102941176470589,70.0,70.0,3502.025,9.4,1884.3214285714287,7276.666666666667,0.025,3180.3076923076924,0.975,0.925,0.875,79.1,79.1,839.7773359840954,1.7491716368455932,740.828655215752,1214.0887480190174,0.003644797879390325,784.0941137346192,0.9963552021206097,0.9777998674618953,0.7213386348575216,26.7,26.7,5584.533333333334,13.833333333333334,3066.0,6500.363636363636,0.03333333333333333,5225.3448275862065,0.9666666666666667,0.9333333333333333,0.9666666666666667,16.7,16.7,3947.766666666667,10.033333333333333,2029.2,4331.48,0.0,3947.766666666667,1.0,0.9333333333333333,0.9666666666666667,93.8,93.8,485.72934040940106,0.3388931008339651,461.7259498787389,847.829268292683,0.000758150113722517,473.48406676783003,0.9992418498862775,0.9984836997725549,0.3055344958301744,86.2,86.2,1557.568,2.224,1124.4037122969837,4263.275362318841,0.004,1499.694779116466,0.996,0.982,0.668,51.1,51.1,3165.311111111111,8.044444444444444,1808.431884057971,4583.866666666667,0.03851851851851852,2651.697996918336,0.9614814814814815,0.9466666666666667,0.8681481481481481,58.05,58.05,2532.102175643958,5.879367569334128,1501.8290438426995,3797.6062314291376,0.013156849980620588,2367.0620950931097,0.9868431500193795,0.9602388772768553,0.7477060912270295
|
| 12 |
+
eval_results-global_step_100,37.9,37.9,1185.8088235294117,2.9595588235294117,841.0970873786408,1395.8994082840236,0.007352941176470588,1076.9592592592592,0.9926470588235294,0.9852941176470589,0.5698529411764706,67.5,67.5,3563.375,14.575,1730.3703703703704,7370.384615384615,0.05,2908.5789473684213,0.95,0.925,0.925,77.6,77.6,821.2372432074221,1.5374420145791916,759.898804440649,1033.7440828402366,0.0023194168323392977,786.0315509797409,0.9976805831676607,0.9781312127236581,0.7256461232604374,13.3,13.3,6191.033333333334,12.3,1540.5,6906.5,0.1,5103.148148148148,0.9,0.8,1.0,26.7,26.7,5226.666666666667,14.4,2493.875,6220.409090909091,0.06666666666666667,4462.285714285715,0.9333333333333333,0.8666666666666667,0.9333333333333333,93.6,93.6,488.1925701288855,0.2850644427596664,463.0,853.9294117647058,0.0,488.1925701288855,1.0,0.9977255496588324,0.3062926459438969,86.0,86.0,1495.094,2.358,1095.7116279069767,3948.442857142857,0.002,1466.0901803607214,0.998,0.988,0.664,50.4,50.4,3217.1451851851853,8.002962962962963,1699.479411764706,4757.462686567164,0.03259259259259259,2785.924961715161,0.9674074074074074,0.9392592592592592,0.8770370370370371,56.625,56.625,2773.5691027563635,7.052253530478904,1327.9915377326677,4060.8465191115865,0.03261645215850864,2384.6514165307563,0.9673835478414913,0.9350096007444344,0.7501452600938968
|
| 13 |
+
eval_results-global_step_110,39.7,39.7,1113.4375,0.9852941176470589,888.6018518518518,1261.5,0.0,1113.4375,1.0,0.9963235294117647,0.5845588235294118,70.0,70.0,2832.05,4.7,1878.4642857142858,5057.083333333333,0.0,2832.05,1.0,1.0,0.9,78.4,78.4,822.2369118621604,1.6494367130550034,729.3629066328687,1159.921658986175,0.0023194168323392977,786.971438060445,0.9976805831676607,0.9777998674618953,0.7292909211398277,26.7,26.7,5703.466666666666,23.133333333333333,1825.75,7113.545454545455,0.06666666666666667,4974.607142857143,0.9333333333333333,0.9333333333333333,0.9666666666666667,20.0,20.0,4268.366666666667,16.766666666666666,2227.8333333333335,4778.5,0.03333333333333333,3860.0,0.9666666666666667,0.9,0.9333333333333333,93.6,93.6,487.8097043214556,0.23805913570887036,473.3962722852512,697.0588235294117,0.0,487.8097043214556,1.0,0.9984836997725549,0.3062926459438969,85.6,85.6,1564.606,2.298,1127.6004672897195,4162.361111111111,0.008,1448.1935483870968,0.992,0.986,0.668,52.0,52.0,2840.131851851852,6.022222222222222,1645.5982905982905,4134.20987654321,0.013333333333333334,2662.214714714715,0.9866666666666667,0.9688888888888889,0.8785185185185185,58.25,58.25,2454.0131626711004,6.9741265235791445,1349.5759259632,3545.5225322560873,0.015456593770709081,2270.660506042607,0.9845434062292909,0.9701036648585546,0.7458326136414569
|
| 14 |
+
eval_results-global_step_120,43.0,43.0,1296.889705882353,1.8235294117647058,864.6837606837607,1623.1354838709678,0.007352941176470588,1187.5777777777778,0.9926470588235294,0.9742647058823529,0.6286764705882353,67.5,67.5,2712.075,5.35,1855.888888888889,4490.307692307692,0.025,2371.128205128205,0.975,0.975,0.875,77.5,77.5,818.4774685222002,1.634857521537442,728.316509837468,1128.4720588235293,0.0026507620941020544,778.1395348837209,0.9973492379058979,0.9768058316766071,0.7226640159045725,20.0,20.0,5528.6,14.6,1503.1666666666667,6534.958333333333,0.1,4365.7037037037035,0.9,0.8333333333333334,1.0,13.3,13.3,4524.0,15.4,1559.25,4980.115384615385,0.0,4524.0,1.0,0.9333333333333333,0.9666666666666667,93.9,93.9,529.7763457164518,0.5375284306292646,489.9668820678514,1138.2222222222222,0.001516300227445034,506.18602885345484,0.9984836997725549,0.9977255496588324,0.3252463987869598,86.0,86.0,1580.476,2.846,1108.6953488372094,4478.557142857143,0.006,1493.486921529175,0.994,0.988,0.644,52.9,52.9,2823.6192592592593,5.832592592592593,1662.2941176470588,4127.37106918239,0.025185185185185185,2482.6854103343467,0.9748148148148148,0.96,0.8548148148148148,56.7625,56.7625,2476.7392224225327,6.003063494565501,1221.532771828613,3562.642423401583,0.02096314858540036,2213.613447776298,0.9790368514145996,0.9548078442355573,0.7521335458451561
|
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 52.5,
|
| 7 |
+
"pass_acc": 52.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 52.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 208.48091626167297,
|
| 12 |
+
"time_use_in_minite": "3:28"
|
| 13 |
+
}
|
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 2,
|
| 6 |
+
"acc": 32.7,
|
| 7 |
+
"pass_acc": 32.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 32.7
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 52.1,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 42.3,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 20.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 66.7,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 22.6,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 22.7,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 18.2,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 66.7,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 18.2
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 245.56918740272522,
|
| 23 |
+
"time_use_in_minite": "4:05"
|
| 24 |
+
}
|
eval_results_ood/global_step_0/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/2025-07-14T12-41-17.800843/details_lighteval|gpqa:diamond|0_2025-07-14T12-41-17.800843.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_0/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/2025-07-17T15-29-55.092392/details_extended|ifeval|0_2025-07-17T15-29-55.092392.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_0/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/results_2025-07-14T12-41-17.800843.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14915100.644070905,
|
| 8 |
+
"end_time": 14915280.850102285,
|
| 9 |
+
"total_evaluation_time_secondes": "180.20603138022125",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.29797979797979796,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.03258630383836556
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.29797979797979796,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.03258630383836556
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "27208ea99069c1be"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "344a39124c320da2"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_0/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/results_2025-07-17T15-29-55.092392.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15182737.142970905,
|
| 8 |
+
"end_time": 15184598.88415676,
|
| 9 |
+
"total_evaluation_time_secondes": "1861.7411858551204",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.34935304990757854,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.020516727431440555,
|
| 36 |
+
"inst_level_strict_acc": 0.4724220623501199,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005228062932701595,
|
| 38 |
+
"prompt_level_loose_acc": 0.41589648798521256,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.021209999356818787,
|
| 40 |
+
"inst_level_loose_acc": 0.5407673860911271,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005363627388437415
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.34935304990757854,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.020516727431440555,
|
| 46 |
+
"inst_level_strict_acc": 0.4724220623501199,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005228062932701595,
|
| 48 |
+
"prompt_level_loose_acc": 0.41589648798521256,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.021209999356818787,
|
| 50 |
+
"inst_level_loose_acc": 0.5407673860911271,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005363627388437415
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "b3cb23aab1952677"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "d5aa5eb24a1035c5"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_0/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface/results_2025-07-17T21-16-09.926343.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15204249.684829334,
|
| 8 |
+
"end_time": 15205382.378836963,
|
| 9 |
+
"total_evaluation_time_secondes": "1132.6940076295286",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_0_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.11567164179104478,
|
| 35 |
+
"codegen_pass@1_stderr": 0.019573324455715798
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.11567164179104478,
|
| 39 |
+
"codegen_pass@1_stderr": 0.019573324455715798
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "81c548b3f6690f8e"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "3af106af7c53c054"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_10/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/2025-07-14T12-45-55.307385/details_lighteval|gpqa:diamond|0_2025-07-14T12-45-55.307385.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_10/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/2025-07-17T15-27-34.486497/details_extended|ifeval|0_2025-07-17T15-27-34.486497.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_10/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/results_2025-07-14T12-45-55.307385.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14915100.012619054,
|
| 8 |
+
"end_time": 14915558.355801448,
|
| 9 |
+
"total_evaluation_time_secondes": "458.34318239428103",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.35858585858585856,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.03416903640391521
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.35858585858585856,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.03416903640391521
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "89bca3b6aa1ee5d9"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "10ce096bb49d767c"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_10/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/results_2025-07-17T15-27-34.486497.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15182737.1428907,
|
| 8 |
+
"end_time": 15184458.310232582,
|
| 9 |
+
"total_evaluation_time_secondes": "1721.167341882363",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.3659889094269871,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.02072934168450661,
|
| 36 |
+
"inst_level_strict_acc": 0.47961630695443647,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005438062494574107,
|
| 38 |
+
"prompt_level_loose_acc": 0.43807763401109057,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.021350931135490938,
|
| 40 |
+
"inst_level_loose_acc": 0.5515587529976019,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005520969995619648
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.3659889094269871,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.02072934168450661,
|
| 46 |
+
"inst_level_strict_acc": 0.47961630695443647,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005438062494574107,
|
| 48 |
+
"prompt_level_loose_acc": 0.43807763401109057,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.021350931135490938,
|
| 50 |
+
"inst_level_loose_acc": 0.5515587529976019,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005520969995619648
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "ca9a743a9c3e5ef7"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "253ed58f2119b003"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_10/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface/results_2025-07-17T21-12-43.036251.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15204249.71116626,
|
| 8 |
+
"end_time": 15205175.496774988,
|
| 9 |
+
"total_evaluation_time_secondes": "925.7856087274849",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_10_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.1455223880597015,
|
| 35 |
+
"codegen_pass@1_stderr": 0.021580406447657102
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.1455223880597015,
|
| 39 |
+
"codegen_pass@1_stderr": 0.021580406447657102
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "6c2ab9e47b593a1e"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "acebbf2f01331140"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_100/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/2025-07-14T12-58-50.192486/details_lighteval|gpqa:diamond|0_2025-07-14T12-58-50.192486.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_100/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/2025-07-17T15-57-23.838956/details_extended|ifeval|0_2025-07-17T15-57-23.838956.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_100/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/results_2025-07-14T12-58-50.192486.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14916139.10446176,
|
| 8 |
+
"end_time": 14916333.240437629,
|
| 9 |
+
"total_evaluation_time_secondes": "194.1359758693725",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.36363636363636365,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.034273086529999344
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.36363636363636365,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.034273086529999344
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "4b4a4a84a983d5d5"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "10ce564f8ee4aac4"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_100/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/results_2025-07-17T15-57-23.838956.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15184832.168388324,
|
| 8 |
+
"end_time": 15186250.766905606,
|
| 9 |
+
"total_evaluation_time_secondes": "1418.5985172819346",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.3974121996303142,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.02105881284751987,
|
| 36 |
+
"inst_level_strict_acc": 0.5143884892086331,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005463073651039658,
|
| 38 |
+
"prompt_level_loose_acc": 0.49353049907578556,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.021514772941787063,
|
| 40 |
+
"inst_level_loose_acc": 0.6019184652278178,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.000559038284798999
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.3974121996303142,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.02105881284751987,
|
| 46 |
+
"inst_level_strict_acc": 0.5143884892086331,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005463073651039658,
|
| 48 |
+
"prompt_level_loose_acc": 0.49353049907578556,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.021514772941787063,
|
| 50 |
+
"inst_level_loose_acc": 0.6019184652278178,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.000559038284798999
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "48870004bffd9c2b"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "11e76e7eb9fae1cc"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_100/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface/results_2025-07-17T22-59-45.311675.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15210627.99526629,
|
| 8 |
+
"end_time": 15211597.771527236,
|
| 9 |
+
"total_evaluation_time_secondes": "969.776260945946",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_100_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.10074626865671642,
|
| 35 |
+
"codegen_pass@1_stderr": 0.01842043910472088
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.10074626865671642,
|
| 39 |
+
"codegen_pass@1_stderr": 0.01842043910472088
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "32041371c6c095c7"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "7ae1fd435640d5ba"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_110/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/2025-07-14T13-00-52.252218/details_lighteval|gpqa:diamond|0_2025-07-14T13-00-52.252218.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_110/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/2025-07-17T15-52-51.610154/details_extended|ifeval|0_2025-07-17T15-52-51.610154.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_110/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/results_2025-07-14T13-00-52.252218.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14916138.99756243,
|
| 8 |
+
"end_time": 14916455.300147096,
|
| 9 |
+
"total_evaluation_time_secondes": "316.30258466489613",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.41919191919191917,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.035155207286704175
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.41919191919191917,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.035155207286704175
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "16095b20f4ee9b55"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "b8a34e90b32493f0"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_110/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/results_2025-07-17T15-52-51.610154.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15184832.125134952,
|
| 8 |
+
"end_time": 15185978.792165717,
|
| 9 |
+
"total_evaluation_time_secondes": "1146.6670307647437",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.4011090573012939,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.021091536895520742,
|
| 36 |
+
"inst_level_strict_acc": 0.5023980815347722,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005544657695916986,
|
| 38 |
+
"prompt_level_loose_acc": 0.4731977818853974,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.021485638661937954,
|
| 40 |
+
"inst_level_loose_acc": 0.5731414868105515,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005625621613331442
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.4011090573012939,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.021091536895520742,
|
| 46 |
+
"inst_level_strict_acc": 0.5023980815347722,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005544657695916986,
|
| 48 |
+
"prompt_level_loose_acc": 0.4731977818853974,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.021485638661937954,
|
| 50 |
+
"inst_level_loose_acc": 0.5731414868105515,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005625621613331442
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "db3e19d6cdc393d4"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "a85e49ae8fdfe5e1"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_110/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface/results_2025-07-17T22-57-39.607069.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15210627.974279799,
|
| 8 |
+
"end_time": 15211472.069707442,
|
| 9 |
+
"total_evaluation_time_secondes": "844.0954276435077",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_110_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.1417910447761194,
|
| 35 |
+
"codegen_pass@1_stderr": 0.021348398039823163
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.1417910447761194,
|
| 39 |
+
"codegen_pass@1_stderr": 0.021348398039823163
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "1311194d12a741fc"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "a34786c13ba46897"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_120/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/2025-07-14T13-13-03.024786/details_lighteval|gpqa:diamond|0_2025-07-14T13-13-03.024786.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_120/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/2025-07-17T16-25-43.731519/details_extended|ifeval|0_2025-07-17T16-25-43.731519.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_120/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/results_2025-07-14T13-13-03.024786.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14916704.352372045,
|
| 8 |
+
"end_time": 14917186.072035735,
|
| 9 |
+
"total_evaluation_time_secondes": "481.71966369077563",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.4494949494949495,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.0354413249194797
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.4494949494949495,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.0354413249194797
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "b3d7fbf204a70934"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "53a6f0cd8c8ba039"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_120/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/results_2025-07-17T16-25-43.731519.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15186738.095798789,
|
| 8 |
+
"end_time": 15187950.951515423,
|
| 9 |
+
"total_evaluation_time_secondes": "1212.8557166345417",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.41404805914972276,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.021196272552471213,
|
| 36 |
+
"inst_level_strict_acc": 0.5191846522781774,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005685205483545439,
|
| 38 |
+
"prompt_level_loose_acc": 0.4787430683918669,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.021497120515987733,
|
| 40 |
+
"inst_level_loose_acc": 0.5851318944844125,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005563399403814487
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.41404805914972276,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.021196272552471213,
|
| 46 |
+
"inst_level_strict_acc": 0.5191846522781774,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005685205483545439,
|
| 48 |
+
"prompt_level_loose_acc": 0.4787430683918669,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.021497120515987733,
|
| 50 |
+
"inst_level_loose_acc": 0.5851318944844125,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005563399403814487
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "bb77cf134de2bf79"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "d17221e69ebba9e2"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_120/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface/results_2025-07-17T23-21-18.939034.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15211628.398449313,
|
| 8 |
+
"end_time": 15212891.402012471,
|
| 9 |
+
"total_evaluation_time_secondes": "1263.003563158214",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_120_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.1417910447761194,
|
| 35 |
+
"codegen_pass@1_stderr": 0.021348398039823163
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.1417910447761194,
|
| 39 |
+
"codegen_pass@1_stderr": 0.021348398039823163
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "117860a7ff3c8feb"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "b7925dc53faf51ad"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_20/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/2025-07-14T12-46-08.302506/details_lighteval|gpqa:diamond|0_2025-07-14T12-46-08.302506.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_20/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/2025-07-17T15-33-20.611749/details_extended|ifeval|0_2025-07-17T15-33-20.611749.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_20/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/results_2025-07-14T12-46-08.302506.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14915100.640499769,
|
| 8 |
+
"end_time": 14915571.350168671,
|
| 9 |
+
"total_evaluation_time_secondes": "470.7096689026803",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.37373737373737376,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.034468977386593325
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.37373737373737376,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.034468977386593325
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "6b07c5212d8b2448"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "c44ee54e365de48c"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_20/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/results_2025-07-17T15-33-20.611749.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15182737.142951412,
|
| 8 |
+
"end_time": 15184802.190311132,
|
| 9 |
+
"total_evaluation_time_secondes": "2065.0473597198725",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.3733826247689464,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.020815238376834508,
|
| 36 |
+
"inst_level_strict_acc": 0.49280575539568344,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005355924227560122,
|
| 38 |
+
"prompt_level_loose_acc": 0.44731977818853974,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.021396815020425963,
|
| 40 |
+
"inst_level_loose_acc": 0.5563549160671463,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005695230844856723
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.3733826247689464,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.020815238376834508,
|
| 46 |
+
"inst_level_strict_acc": 0.49280575539568344,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005355924227560122,
|
| 48 |
+
"prompt_level_loose_acc": 0.44731977818853974,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.021396815020425963,
|
| 50 |
+
"inst_level_loose_acc": 0.5563549160671463,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005695230844856723
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "cd9a6b34fc82e0d2"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "72cc1aa5d33f61b1"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_20/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface/results_2025-07-17T21-33-30.966895.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15205415.356863018,
|
| 8 |
+
"end_time": 15206423.42765456,
|
| 9 |
+
"total_evaluation_time_secondes": "1008.0707915425301",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_20_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.13059701492537312,
|
| 35 |
+
"codegen_pass@1_stderr": 0.02062156198724618
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.13059701492537312,
|
| 39 |
+
"codegen_pass@1_stderr": 0.02062156198724618
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "d35ed2c195462e63"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "cac125ba5766faa2"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_30/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/2025-07-14T12-41-43.868511/details_lighteval|gpqa:diamond|0_2025-07-14T12-41-43.868511.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_30/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/2025-07-17T15-22-22.924052/details_extended|ifeval|0_2025-07-17T15-22-22.924052.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_30/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/results_2025-07-14T12-41-43.868511.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14915100.525758365,
|
| 8 |
+
"end_time": 14915306.916785764,
|
| 9 |
+
"total_evaluation_time_secondes": "206.39102739840746",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.42424242424242425,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.035212249088415824
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.42424242424242425,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.035212249088415824
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "5fca5cdc8952cf48"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "29f6c622ec0e40ae"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_30/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/results_2025-07-17T15-22-22.924052.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15182737.142965583,
|
| 8 |
+
"end_time": 15184146.64612279,
|
| 9 |
+
"total_evaluation_time_secondes": "1409.5031572077423",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.36968576709796674,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.0207729436163323,
|
| 36 |
+
"inst_level_strict_acc": 0.4904076738609113,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005529038106109171,
|
| 38 |
+
"prompt_level_loose_acc": 0.4288354898336414,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.02129752256905075,
|
| 40 |
+
"inst_level_loose_acc": 0.5503597122302158,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005446669740365313
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.36968576709796674,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.0207729436163323,
|
| 46 |
+
"inst_level_strict_acc": 0.4904076738609113,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005529038106109171,
|
| 48 |
+
"prompt_level_loose_acc": 0.4288354898336414,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.02129752256905075,
|
| 50 |
+
"inst_level_loose_acc": 0.5503597122302158,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005446669740365313
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "bf3fca5a3af16436"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "d30b49dd80e054a0"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_30/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface/results_2025-07-17T21-33-36.117826.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15205415.356977923,
|
| 8 |
+
"end_time": 15206428.57492127,
|
| 9 |
+
"total_evaluation_time_secondes": "1013.2179433479905",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_30_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.13059701492537312,
|
| 35 |
+
"codegen_pass@1_stderr": 0.020621561987246184
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.13059701492537312,
|
| 39 |
+
"codegen_pass@1_stderr": 0.020621561987246184
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "75f3f0f1393644bd"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "0f5e42c64b80a27a"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_40/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/2025-07-14T12-49-47.596528/details_lighteval|gpqa:diamond|0_2025-07-14T12-49-47.596528.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_40/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/2025-07-17T15-23-49.905332/details_extended|ifeval|0_2025-07-17T15-23-49.905332.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_40/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/results_2025-07-14T12-49-47.596528.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14915581.460954065,
|
| 8 |
+
"end_time": 14915790.644901186,
|
| 9 |
+
"total_evaluation_time_secondes": "209.1839471217245",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.40404040404040403,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.03496130972056127
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.40404040404040403,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.03496130972056127
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "34d5345add45acc2"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "a5ed185afee461f7"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_40/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/results_2025-07-17T15-23-49.905332.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15182737.143004768,
|
| 8 |
+
"end_time": 15184235.135938007,
|
| 9 |
+
"total_evaluation_time_secondes": "1497.9929332397878",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.38632162661737524,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.020953088140869384,
|
| 36 |
+
"inst_level_strict_acc": 0.4952038369304556,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005569695302516132,
|
| 38 |
+
"prompt_level_loose_acc": 0.45471349353049906,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.02142813710693671,
|
| 40 |
+
"inst_level_loose_acc": 0.5695443645083933,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005477311584246324
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.38632162661737524,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.020953088140869384,
|
| 46 |
+
"inst_level_strict_acc": 0.4952038369304556,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005569695302516132,
|
| 48 |
+
"prompt_level_loose_acc": 0.45471349353049906,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.02142813710693671,
|
| 50 |
+
"inst_level_loose_acc": 0.5695443645083933,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005477311584246324
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "465c920924784241"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "5a10ce5273d76a99"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_40/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface/results_2025-07-17T21-52-17.853514.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15206463.848549245,
|
| 8 |
+
"end_time": 15207550.294577831,
|
| 9 |
+
"total_evaluation_time_secondes": "1086.446028586477",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_40_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.11567164179104478,
|
| 35 |
+
"codegen_pass@1_stderr": 0.01957332445571579
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.11567164179104478,
|
| 39 |
+
"codegen_pass@1_stderr": 0.01957332445571579
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "63f2e7f4661c2dc9"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "25868654ff56d82d"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_50/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/2025-07-14T12-55-00.091779/details_lighteval|gpqa:diamond|0_2025-07-14T12-55-00.091779.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_50/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/2025-07-17T15-24-59.219716/details_extended|ifeval|0_2025-07-17T15-24-59.219716.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_50/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/results_2025-07-14T12-55-00.091779.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 14915581.71616168,
|
| 8 |
+
"end_time": 14916103.138410663,
|
| 9 |
+
"total_evaluation_time_secondes": "521.4222489837557",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"lighteval|gpqa:diamond|0": {
|
| 34 |
+
"gpqa_pass@1:1_samples": 0.35858585858585856,
|
| 35 |
+
"gpqa_pass@1:1_samples_stderr": 0.03416903640391521
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"gpqa_pass@1:1_samples": 0.35858585858585856,
|
| 39 |
+
"gpqa_pass@1:1_samples_stderr": 0.03416903640391521
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"lighteval|gpqa:diamond|0": 1
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"lighteval|gpqa:diamond": {
|
| 47 |
+
"name": "gpqa:diamond",
|
| 48 |
+
"prompt_function": "gpqa_instruct",
|
| 49 |
+
"hf_repo": "Idavidrein/gpqa",
|
| 50 |
+
"hf_subset": "gpqa_diamond",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "gpqa_pass@1:1_samples",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "compute",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"train"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"train"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"lighteval"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 198,
|
| 80 |
+
"effective_num_docs": 198,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 1
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"lighteval|gpqa:diamond|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "50ecb6f5d091bd95",
|
| 89 |
+
"hash_full_prompts": "1b19c7f64e1e9b2a",
|
| 90 |
+
"hash_input_tokens": "864f299da9b1369e",
|
| 91 |
+
"hash_cont_tokens": "68d2fae6aa44585f"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 198,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 198,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "a9318dbdd867770b",
|
| 104 |
+
"hash_full_prompts": "eeb532dc3dbd3bac",
|
| 105 |
+
"hash_input_tokens": "36c6d3fad6c1cb8a",
|
| 106 |
+
"hash_cont_tokens": "897487148ea9cc19"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 198,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 198,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_50/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/results_2025-07-17T15-24-59.219716.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15182737.142973073,
|
| 8 |
+
"end_time": 15184304.211107643,
|
| 9 |
+
"total_evaluation_time_secondes": "1567.0681345704943",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.4048059149722736,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.02112301212105356,
|
| 36 |
+
"inst_level_strict_acc": 0.5191846522781774,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005392596754638425,
|
| 38 |
+
"prompt_level_loose_acc": 0.4731977818853974,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.021485638661937957,
|
| 40 |
+
"inst_level_loose_acc": 0.5839328537170264,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005423750292432568
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.4048059149722736,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.02112301212105356,
|
| 46 |
+
"inst_level_strict_acc": 0.5191846522781774,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005392596754638425,
|
| 48 |
+
"prompt_level_loose_acc": 0.4731977818853974,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.021485638661937957,
|
| 50 |
+
"inst_level_loose_acc": 0.5839328537170264,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005423750292432568
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "ed9d72adaba626e0"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "786a26d17129c170"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_50/results/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface/results_2025-07-17T22-00-10.015386.json
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 15206463.8485616,
|
| 8 |
+
"end_time": 15208022.4764252,
|
| 9 |
+
"total_evaluation_time_secondes": "1558.62786360085",
|
| 10 |
+
"model_name": "_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_50_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|lcb:codegeneration|0": {
|
| 34 |
+
"codegen_pass@1": 0.11940298507462686,
|
| 35 |
+
"codegen_pass@1_stderr": 0.019844518505590138
|
| 36 |
+
},
|
| 37 |
+
"all": {
|
| 38 |
+
"codegen_pass@1": 0.11940298507462686,
|
| 39 |
+
"codegen_pass@1_stderr": 0.019844518505590138
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"versions": {
|
| 43 |
+
"extended|lcb:codegeneration|0": 0
|
| 44 |
+
},
|
| 45 |
+
"config_tasks": {
|
| 46 |
+
"extended|lcb:codegeneration": {
|
| 47 |
+
"name": "lcb:codegeneration",
|
| 48 |
+
"prompt_function": "lcb_codegeneration_prompt_fn",
|
| 49 |
+
"hf_repo": "livecodebench/code_generation_lite",
|
| 50 |
+
"hf_subset": "v4_v5",
|
| 51 |
+
"metric": [
|
| 52 |
+
{
|
| 53 |
+
"metric_name": "codegen_pass@1",
|
| 54 |
+
"higher_is_better": true,
|
| 55 |
+
"category": "5",
|
| 56 |
+
"use_case": "6",
|
| 57 |
+
"sample_level_fn": "codegen_metric",
|
| 58 |
+
"corpus_level_fn": "mean"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"hf_revision": null,
|
| 62 |
+
"hf_filter": null,
|
| 63 |
+
"hf_avail_splits": [
|
| 64 |
+
"test"
|
| 65 |
+
],
|
| 66 |
+
"trust_dataset": true,
|
| 67 |
+
"evaluation_splits": [
|
| 68 |
+
"test"
|
| 69 |
+
],
|
| 70 |
+
"few_shots_split": null,
|
| 71 |
+
"few_shots_select": null,
|
| 72 |
+
"generation_size": 32768,
|
| 73 |
+
"generation_grammar": null,
|
| 74 |
+
"stop_sequence": [],
|
| 75 |
+
"num_samples": null,
|
| 76 |
+
"suite": [
|
| 77 |
+
"extended"
|
| 78 |
+
],
|
| 79 |
+
"original_num_docs": 268,
|
| 80 |
+
"effective_num_docs": 268,
|
| 81 |
+
"must_remove_duplicate_docs": false,
|
| 82 |
+
"version": 0
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"summary_tasks": {
|
| 86 |
+
"extended|lcb:codegeneration|0": {
|
| 87 |
+
"hashes": {
|
| 88 |
+
"hash_examples": "8db44696993f602e",
|
| 89 |
+
"hash_full_prompts": "088c68bb5b0ac575",
|
| 90 |
+
"hash_input_tokens": "886dbed8aff2cbbd",
|
| 91 |
+
"hash_cont_tokens": "ba7a410d84a1d19d"
|
| 92 |
+
},
|
| 93 |
+
"truncated": 0,
|
| 94 |
+
"non_truncated": 268,
|
| 95 |
+
"padded": 0,
|
| 96 |
+
"non_padded": 268,
|
| 97 |
+
"effective_few_shots": 0.0,
|
| 98 |
+
"num_truncated_few_shots": 0
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"summary_general": {
|
| 102 |
+
"hashes": {
|
| 103 |
+
"hash_examples": "cc556c609442b994",
|
| 104 |
+
"hash_full_prompts": "98e4b3639be2040d",
|
| 105 |
+
"hash_input_tokens": "a989c3816b4cf3b6",
|
| 106 |
+
"hash_cont_tokens": "61b1862ccc5963b5"
|
| 107 |
+
},
|
| 108 |
+
"truncated": 0,
|
| 109 |
+
"non_truncated": 268,
|
| 110 |
+
"padded": 0,
|
| 111 |
+
"non_padded": 268,
|
| 112 |
+
"num_truncated_few_shots": 0
|
| 113 |
+
}
|
| 114 |
+
}
|
eval_results_ood/global_step_60/details/_home_work_minzijun_rl_output_checkpoints_verl_0.4_grpo_simplelr_math_35_train_Qwen3-8B-base_max_response4096_batch1024_rollout8_vllm_global_step_60_actor_huggingface/2025-07-14T12-55-24.416815/details_lighteval|gpqa:diamond|0_2025-07-14T12-55-24.416815.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|