bensondccnqwc's picture
Add files using upload-large-folder tool
dfe5215 verified
exp_name,global_step,model_name,json_relpath,prompt_level_strict_acc,prompt_level_strict_acc_stderr,inst_level_strict_acc,inst_level_strict_acc_stderr,prompt_level_loose_acc,prompt_level_loose_acc_stderr,inst_level_loose_acc,inst_level_loose_acc_stderr,gpqa_pass@1:1_samples,gpqa_pass@1:1_samples_stderr
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,0,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_0_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_0_actor_huggingface/results_2025-09-01T20-02-30.740371.json,0.20517560073937152,0.017378071196759645,0.31534772182254195,0.0004911737949630811,0.21811460258780038,0.01777121177665385,0.3369304556354916,0.0005025546433398996,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,10,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_10_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_10_actor_huggingface/results_2025-09-01T19-54-33.485582.json,0.2199630314232902,0.0178252471922171,0.3489208633093525,0.0005098898148481087,0.23844731977818853,0.018337888094243953,0.3669064748201439,0.0005241203923238652,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,20,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_20_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_20_actor_huggingface/results_2025-09-01T19-30-43.054478.json,0.2587800369685767,0.018846992560712535,0.37290167865707435,0.0004830390823875894,0.2975970425138632,0.01967481200441347,0.4136690647482014,0.0005103631365089632,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,30,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_30_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_30_actor_huggingface/results_2025-09-01T19-15-20.431242.json,0.2920517560073937,0.019567429846009426,0.4040767386091127,0.0005223508064195623,0.36229205175600737,0.020684424314965394,0.46882494004796166,0.0005490123525754327,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,40,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_40_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_40/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_40_actor_huggingface/results_2025-09-01T19-12-17.431962.json,0.27726432532347506,0.019263706963479364,0.41247002398081534,0.0005103258295347225,0.3364140480591497,0.020332406004701275,0.46642685851318944,0.0005282899092217354,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,50,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_50_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_50/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_50_actor_huggingface/results_2025-09-01T19-12-18.935802.json,0.25693160813308685,0.018802962575636847,0.3872901678657074,0.0005108233092247319,0.3197781885397412,0.020070251556580192,0.45083932853717024,0.0005299013229451437,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,60,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_60_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_60/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_60_actor_huggingface/results_2025-09-01T20-10-41.425792.json,0.26062846580406657,0.018890584986760287,0.38848920863309355,0.0005068723629754001,0.3234750462107209,0.02013100339211897,0.45083932853717024,0.0005303966782759128,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,70,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_70_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_70/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_70_actor_huggingface/results_2025-09-01T20-10-39.227507.json,0.25508317929759705,0.018758491950414135,0.3752997601918465,0.0005067240670060003,0.3123844731977819,0.019944386293758908,0.43285371702637887,0.0005244145838037965,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,80,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_80_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_80/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_80_actor_huggingface/results_2025-09-01T20-10-40.131826.json,0.22365988909426987,0.017931771054658378,0.35611510791366907,0.000493985540441849,0.2735674676524954,0.019183727107392815,0.4136690647482014,0.0005017462545450149,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,90,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_90_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_90/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_90_actor_huggingface/results_2025-09-01T20-10-38.728081.json,0.24029574861367836,0.01838647358148708,0.35251798561151076,0.0005101968226879204,0.2920517560073937,0.019567429846009433,0.420863309352518,0.0005185049553865136,,
verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all,100,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_100_actor_huggingface,verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all/eval_results_ood/global_step_100/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_grpo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.5_random-reuse-baseline-all_global_step_100_actor_huggingface/results_2025-09-01T20-10-41.932006.json,0.22181146025878004,0.017878765407944374,0.3381294964028777,0.0005062901498414205,0.2754158964879852,0.01922392319624202,0.39688249400479614,0.0005268136750827436,,