bensondccnqwc commited on
Commit
457d1e5
·
verified ·
1 Parent(s): 2c223d2

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results_ood/global_step_0/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/2025-08-29T11-17-46.747738/details_extended|ifeval|0_2025-08-29T11-17-46.747738.csv +0 -0
  2. eval_results_ood/global_step_0/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/2025-09-23T15-20-09.172749/details_extended|ifeval|0_2025-09-23T15-20-09.172749.csv +0 -0
  3. eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-29T11-17-46.747738.json +141 -0
  4. eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-09-23T15-20-09.172749.json +141 -0
  5. eval_results_ood/global_step_10/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/2025-09-23T15-19-07.845971/details_extended|ifeval|0_2025-09-23T15-19-07.845971.csv +0 -0
  6. eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-09-23T15-19-07.845971.json +141 -0
  7. eval_results_ood/global_step_15/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/2025-08-29T11-16-24.695422/details_extended|ifeval|0_2025-08-29T11-16-24.695422.csv +0 -0
  8. eval_results_ood/global_step_15/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/2025-09-23T15-18-58.766285/details_extended|ifeval|0_2025-09-23T15-18-58.766285.csv +0 -0
  9. eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-29T11-16-24.695422.json +141 -0
  10. eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-09-23T15-18-58.766285.json +141 -0
  11. eval_results_ood/global_step_20/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/2025-08-29T11-16-25.542329/details_extended|ifeval|0_2025-08-29T11-16-25.542329.csv +0 -0
  12. eval_results_ood/global_step_20/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/2025-09-23T15-20-14.009200/details_extended|ifeval|0_2025-09-23T15-20-14.009200.csv +0 -0
  13. eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-29T11-16-25.542329.json +141 -0
  14. eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-09-23T15-20-14.009200.json +141 -0
  15. eval_results_ood/global_step_25/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/2025-08-29T11-16-17.034884/details_extended|ifeval|0_2025-08-29T11-16-17.034884.csv +0 -0
  16. eval_results_ood/global_step_25/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/2025-09-23T15-20-14.861838/details_extended|ifeval|0_2025-09-23T15-20-14.861838.csv +0 -0
  17. eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-29T11-16-17.034884.json +141 -0
  18. eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-09-23T15-20-14.861838.json +141 -0
  19. eval_results_ood/global_step_30/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/2025-08-29T11-17-00.747010/details_extended|ifeval|0_2025-08-29T11-17-00.747010.csv +0 -0
  20. eval_results_ood/global_step_30/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/2025-09-23T15-18-49.877513/details_extended|ifeval|0_2025-09-23T15-18-49.877513.csv +0 -0
  21. eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-29T11-17-00.747010.json +141 -0
  22. eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-09-23T15-18-49.877513.json +141 -0
  23. eval_results_ood/global_step_35/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/2025-08-29T11-16-37.152415/details_extended|ifeval|0_2025-08-29T11-16-37.152415.csv +0 -0
  24. eval_results_ood/global_step_35/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/2025-09-23T15-23-52.120403/details_extended|ifeval|0_2025-09-23T15-23-52.120403.csv +0 -0
  25. eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-29T11-16-37.152415.json +141 -0
  26. eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-09-23T15-23-52.120403.json +141 -0
  27. eval_results_ood/global_step_5/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/2025-08-29T11-16-14.580600/details_extended|ifeval|0_2025-08-29T11-16-14.580600.csv +0 -0
  28. eval_results_ood/global_step_5/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/2025-09-23T15-18-54.425207/details_extended|ifeval|0_2025-09-23T15-18-54.425207.csv +0 -0
  29. eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-29T11-16-14.580600.json +141 -0
  30. eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-09-23T15-18-54.425207.json +141 -0
  31. eval_results_ood/results.csv +16 -0
  32. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  33. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  34. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  35. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  36. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  37. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  38. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  39. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  40. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  41. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  42. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  43. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  44. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_30--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  45. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_30--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  46. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_30--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  47. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_35--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  48. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_35--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  49. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_35--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  50. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_5--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
eval_results_ood/global_step_0/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/2025-08-29T11-17-46.747738/details_extended|ifeval|0_2025-08-29T11-17-46.747738.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_0/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/2025-09-23T15-20-09.172749/details_extended|ifeval|0_2025-09-23T15-20-09.172749.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-29T11-17-46.747738.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18883988.60784555,
8
+ "end_time": 18884586.726209104,
9
+ "total_evaluation_time_secondes": "598.1183635555208",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.39371534195933455,
35
+ "prompt_level_strict_acc_stderr": 0.02102483414587237,
36
+ "inst_level_strict_acc": 0.5215827338129496,
37
+ "inst_level_strict_acc_stderr": 0.0005599264307527568,
38
+ "prompt_level_loose_acc": 0.4288354898336414,
39
+ "prompt_level_loose_acc_stderr": 0.021297522569050743,
40
+ "inst_level_loose_acc": 0.5563549160671463,
41
+ "inst_level_loose_acc_stderr": 0.0005527714546864814
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.39371534195933455,
45
+ "prompt_level_strict_acc_stderr": 0.02102483414587237,
46
+ "inst_level_strict_acc": 0.5215827338129496,
47
+ "inst_level_strict_acc_stderr": 0.0005599264307527568,
48
+ "prompt_level_loose_acc": 0.4288354898336414,
49
+ "prompt_level_loose_acc_stderr": 0.021297522569050743,
50
+ "inst_level_loose_acc": 0.5563549160671463,
51
+ "inst_level_loose_acc_stderr": 0.0005527714546864814
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "ad10e21c9427089f",
117
+ "hash_input_tokens": "76e5046dde448ac8",
118
+ "hash_cont_tokens": "2fd4382748c56918"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "edca08807dd5a8ec",
132
+ "hash_input_tokens": "6db7fa56ea148d89",
133
+ "hash_cont_tokens": "a571a26c128a03d4"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-09-23T15-20-09.172749.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 2819894.465284853,
8
+ "end_time": 2820247.173341552,
9
+ "total_evaluation_time_secondes": "352.7080566990189",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.39371534195933455,
35
+ "prompt_level_strict_acc_stderr": 0.02102483414587237,
36
+ "inst_level_strict_acc": 0.5191846522781774,
37
+ "inst_level_strict_acc_stderr": 0.0005633834275419194,
38
+ "prompt_level_loose_acc": 0.4399260628465804,
39
+ "prompt_level_loose_acc_stderr": 0.021360708220801976,
40
+ "inst_level_loose_acc": 0.564748201438849,
41
+ "inst_level_loose_acc_stderr": 0.0005587046293737301
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.39371534195933455,
45
+ "prompt_level_strict_acc_stderr": 0.02102483414587237,
46
+ "inst_level_strict_acc": 0.5191846522781774,
47
+ "inst_level_strict_acc_stderr": 0.0005633834275419194,
48
+ "prompt_level_loose_acc": 0.4399260628465804,
49
+ "prompt_level_loose_acc_stderr": 0.021360708220801976,
50
+ "inst_level_loose_acc": 0.564748201438849,
51
+ "inst_level_loose_acc_stderr": 0.0005587046293737301
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "3972e282d6494c90",
117
+ "hash_input_tokens": "f702e1af092e9cc3",
118
+ "hash_cont_tokens": "f037b3f92aa6d818"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "b1fd186794cb0bb3",
132
+ "hash_input_tokens": "5fc219976d10c06f",
133
+ "hash_cont_tokens": "8783819c7352f952"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_10/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/2025-09-23T15-19-07.845971/details_extended|ifeval|0_2025-09-23T15-19-07.845971.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-09-23T15-19-07.845971.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 2819894.524622857,
8
+ "end_time": 2820185.260255099,
9
+ "total_evaluation_time_secondes": "290.73563224170357",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.39186691312384475,
35
+ "prompt_level_strict_acc_stderr": 0.021007372406188354,
36
+ "inst_level_strict_acc": 0.5143884892086331,
37
+ "inst_level_strict_acc_stderr": 0.0005407156424898863,
38
+ "prompt_level_loose_acc": 0.43253234750462105,
39
+ "prompt_level_loose_acc_stderr": 0.02131979239817807,
40
+ "inst_level_loose_acc": 0.552757793764988,
41
+ "inst_level_loose_acc_stderr": 0.0005395531273954375
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.39186691312384475,
45
+ "prompt_level_strict_acc_stderr": 0.021007372406188354,
46
+ "inst_level_strict_acc": 0.5143884892086331,
47
+ "inst_level_strict_acc_stderr": 0.0005407156424898863,
48
+ "prompt_level_loose_acc": 0.43253234750462105,
49
+ "prompt_level_loose_acc_stderr": 0.02131979239817807,
50
+ "inst_level_loose_acc": 0.552757793764988,
51
+ "inst_level_loose_acc_stderr": 0.0005395531273954375
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "3972e282d6494c90",
117
+ "hash_input_tokens": "f702e1af092e9cc3",
118
+ "hash_cont_tokens": "ca5820a7f4067bfa"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "b1fd186794cb0bb3",
132
+ "hash_input_tokens": "5fc219976d10c06f",
133
+ "hash_cont_tokens": "4be167cbb4c1d0a3"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_15/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/2025-08-29T11-16-24.695422/details_extended|ifeval|0_2025-08-29T11-16-24.695422.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_15/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/2025-09-23T15-18-58.766285/details_extended|ifeval|0_2025-09-23T15-18-58.766285.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-29T11-16-24.695422.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18883988.62186077,
8
+ "end_time": 18884503.508060712,
9
+ "total_evaluation_time_secondes": "514.8861999437213",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.3752310536044362,
35
+ "prompt_level_strict_acc_stderr": 0.020835898065035694,
36
+ "inst_level_strict_acc": 0.5095923261390888,
37
+ "inst_level_strict_acc_stderr": 0.0005432302541010317,
38
+ "prompt_level_loose_acc": 0.4269870609981516,
39
+ "prompt_level_loose_acc_stderr": 0.02128593305006131,
40
+ "inst_level_loose_acc": 0.5587529976019184,
41
+ "inst_level_loose_acc_stderr": 0.000545503372755182
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.3752310536044362,
45
+ "prompt_level_strict_acc_stderr": 0.020835898065035694,
46
+ "inst_level_strict_acc": 0.5095923261390888,
47
+ "inst_level_strict_acc_stderr": 0.0005432302541010317,
48
+ "prompt_level_loose_acc": 0.4269870609981516,
49
+ "prompt_level_loose_acc_stderr": 0.02128593305006131,
50
+ "inst_level_loose_acc": 0.5587529976019184,
51
+ "inst_level_loose_acc_stderr": 0.000545503372755182
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "ad10e21c9427089f",
117
+ "hash_input_tokens": "76e5046dde448ac8",
118
+ "hash_cont_tokens": "af1056c06f4272cc"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "edca08807dd5a8ec",
132
+ "hash_input_tokens": "6db7fa56ea148d89",
133
+ "hash_cont_tokens": "621f1d09b05aface"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-09-23T15-18-58.766285.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 2819894.516623436,
8
+ "end_time": 2820176.0063693,
9
+ "total_evaluation_time_secondes": "281.489745864179",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.37707948243992606,
35
+ "prompt_level_strict_acc_stderr": 0.020856233918528456,
36
+ "inst_level_strict_acc": 0.5155875299760192,
37
+ "inst_level_strict_acc_stderr": 0.0005624364712452048,
38
+ "prompt_level_loose_acc": 0.4343807763401109,
39
+ "prompt_level_loose_acc_stderr": 0.021330473657564727,
40
+ "inst_level_loose_acc": 0.5671462829736211,
41
+ "inst_level_loose_acc_stderr": 0.0005536778232943271
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.37707948243992606,
45
+ "prompt_level_strict_acc_stderr": 0.020856233918528456,
46
+ "inst_level_strict_acc": 0.5155875299760192,
47
+ "inst_level_strict_acc_stderr": 0.0005624364712452048,
48
+ "prompt_level_loose_acc": 0.4343807763401109,
49
+ "prompt_level_loose_acc_stderr": 0.021330473657564727,
50
+ "inst_level_loose_acc": 0.5671462829736211,
51
+ "inst_level_loose_acc_stderr": 0.0005536778232943271
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "3972e282d6494c90",
117
+ "hash_input_tokens": "f702e1af092e9cc3",
118
+ "hash_cont_tokens": "b8a7398ee3d1035f"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "b1fd186794cb0bb3",
132
+ "hash_input_tokens": "5fc219976d10c06f",
133
+ "hash_cont_tokens": "aaa3fa41573a6ce4"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_20/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/2025-08-29T11-16-25.542329/details_extended|ifeval|0_2025-08-29T11-16-25.542329.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_20/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/2025-09-23T15-20-14.009200/details_extended|ifeval|0_2025-09-23T15-20-14.009200.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-29T11-16-25.542329.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18883988.60164322,
8
+ "end_time": 18884503.444405336,
9
+ "total_evaluation_time_secondes": "514.8427621163428",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.3807763401109057,
35
+ "prompt_level_strict_acc_stderr": 0.020895937888190833,
36
+ "inst_level_strict_acc": 0.511990407673861,
37
+ "inst_level_strict_acc_stderr": 0.0005284703064941477,
38
+ "prompt_level_loose_acc": 0.43807763401109057,
39
+ "prompt_level_loose_acc_stderr": 0.021350931135490935,
40
+ "inst_level_loose_acc": 0.5599520383693045,
41
+ "inst_level_loose_acc_stderr": 0.0005508806104783129
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.3807763401109057,
45
+ "prompt_level_strict_acc_stderr": 0.020895937888190833,
46
+ "inst_level_strict_acc": 0.511990407673861,
47
+ "inst_level_strict_acc_stderr": 0.0005284703064941477,
48
+ "prompt_level_loose_acc": 0.43807763401109057,
49
+ "prompt_level_loose_acc_stderr": 0.021350931135490935,
50
+ "inst_level_loose_acc": 0.5599520383693045,
51
+ "inst_level_loose_acc_stderr": 0.0005508806104783129
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "ad10e21c9427089f",
117
+ "hash_input_tokens": "76e5046dde448ac8",
118
+ "hash_cont_tokens": "2c692a9c51322e27"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "edca08807dd5a8ec",
132
+ "hash_input_tokens": "6db7fa56ea148d89",
133
+ "hash_cont_tokens": "56179f027b3dfdb2"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-09-23T15-20-14.009200.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 2819894.464804047,
8
+ "end_time": 2820250.597080663,
9
+ "total_evaluation_time_secondes": "356.13227661605924",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.4011090573012939,
35
+ "prompt_level_strict_acc_stderr": 0.02109153689552074,
36
+ "inst_level_strict_acc": 0.5143884892086331,
37
+ "inst_level_strict_acc_stderr": 0.0005584775478161843,
38
+ "prompt_level_loose_acc": 0.4510166358595194,
39
+ "prompt_level_loose_acc_stderr": 0.02141307276535979,
40
+ "inst_level_loose_acc": 0.5635491606714629,
41
+ "inst_level_loose_acc_stderr": 0.0005514767622336946
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.4011090573012939,
45
+ "prompt_level_strict_acc_stderr": 0.02109153689552074,
46
+ "inst_level_strict_acc": 0.5143884892086331,
47
+ "inst_level_strict_acc_stderr": 0.0005584775478161843,
48
+ "prompt_level_loose_acc": 0.4510166358595194,
49
+ "prompt_level_loose_acc_stderr": 0.02141307276535979,
50
+ "inst_level_loose_acc": 0.5635491606714629,
51
+ "inst_level_loose_acc_stderr": 0.0005514767622336946
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "3972e282d6494c90",
117
+ "hash_input_tokens": "f702e1af092e9cc3",
118
+ "hash_cont_tokens": "f3dd45787c281c85"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "b1fd186794cb0bb3",
132
+ "hash_input_tokens": "5fc219976d10c06f",
133
+ "hash_cont_tokens": "e7a70658b0e79a5e"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_25/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/2025-08-29T11-16-17.034884/details_extended|ifeval|0_2025-08-29T11-16-17.034884.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_25/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/2025-09-23T15-20-14.861838/details_extended|ifeval|0_2025-09-23T15-20-14.861838.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-29T11-16-17.034884.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18883988.50661321,
8
+ "end_time": 18884494.977114405,
9
+ "total_evaluation_time_secondes": "506.4705011956394",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.35489833641404805,
35
+ "prompt_level_strict_acc_stderr": 0.020590608575926494,
36
+ "inst_level_strict_acc": 0.49640287769784175,
37
+ "inst_level_strict_acc_stderr": 0.0005353366435758254,
38
+ "prompt_level_loose_acc": 0.4195933456561922,
39
+ "prompt_level_loose_acc_stderr": 0.021236532548855144,
40
+ "inst_level_loose_acc": 0.5467625899280576,
41
+ "inst_level_loose_acc_stderr": 0.000542489217697426
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.35489833641404805,
45
+ "prompt_level_strict_acc_stderr": 0.020590608575926494,
46
+ "inst_level_strict_acc": 0.49640287769784175,
47
+ "inst_level_strict_acc_stderr": 0.0005353366435758254,
48
+ "prompt_level_loose_acc": 0.4195933456561922,
49
+ "prompt_level_loose_acc_stderr": 0.021236532548855144,
50
+ "inst_level_loose_acc": 0.5467625899280576,
51
+ "inst_level_loose_acc_stderr": 0.000542489217697426
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "ad10e21c9427089f",
117
+ "hash_input_tokens": "76e5046dde448ac8",
118
+ "hash_cont_tokens": "b2ce952647c1913e"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "edca08807dd5a8ec",
132
+ "hash_input_tokens": "6db7fa56ea148d89",
133
+ "hash_cont_tokens": "77b8312cfe427550"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-09-23T15-20-14.861838.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 2819894.464696055,
8
+ "end_time": 2820251.742639422,
9
+ "total_evaluation_time_secondes": "357.2779433671385",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.4121996303142329,
35
+ "prompt_level_strict_acc_stderr": 0.0211822381517332,
36
+ "inst_level_strict_acc": 0.5443645083932853,
37
+ "inst_level_strict_acc_stderr": 0.0005250797455930996,
38
+ "prompt_level_loose_acc": 0.46950092421441775,
39
+ "prompt_level_loose_acc_stderr": 0.02147650768114301,
40
+ "inst_level_loose_acc": 0.5911270983213429,
41
+ "inst_level_loose_acc_stderr": 0.0005233831979716518
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.4121996303142329,
45
+ "prompt_level_strict_acc_stderr": 0.0211822381517332,
46
+ "inst_level_strict_acc": 0.5443645083932853,
47
+ "inst_level_strict_acc_stderr": 0.0005250797455930996,
48
+ "prompt_level_loose_acc": 0.46950092421441775,
49
+ "prompt_level_loose_acc_stderr": 0.02147650768114301,
50
+ "inst_level_loose_acc": 0.5911270983213429,
51
+ "inst_level_loose_acc_stderr": 0.0005233831979716518
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "3972e282d6494c90",
117
+ "hash_input_tokens": "f702e1af092e9cc3",
118
+ "hash_cont_tokens": "13df23e3d89d50d6"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "b1fd186794cb0bb3",
132
+ "hash_input_tokens": "5fc219976d10c06f",
133
+ "hash_cont_tokens": "3aa74441e103e1f2"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_30/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/2025-08-29T11-17-00.747010/details_extended|ifeval|0_2025-08-29T11-17-00.747010.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_30/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/2025-09-23T15-18-49.877513/details_extended|ifeval|0_2025-09-23T15-18-49.877513.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-29T11-17-00.747010.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18883988.700935464,
8
+ "end_time": 18884540.474919353,
9
+ "total_evaluation_time_secondes": "551.7739838883281",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.3844731977818854,
35
+ "prompt_level_strict_acc_stderr": 0.02093435763458466,
36
+ "inst_level_strict_acc": 0.5035971223021583,
37
+ "inst_level_strict_acc_stderr": 0.0005667334369309774,
38
+ "prompt_level_loose_acc": 0.4584103512014787,
39
+ "prompt_level_loose_acc_stderr": 0.021442010560476534,
40
+ "inst_level_loose_acc": 0.5671462829736211,
41
+ "inst_level_loose_acc_stderr": 0.0005740916373334755
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.3844731977818854,
45
+ "prompt_level_strict_acc_stderr": 0.02093435763458466,
46
+ "inst_level_strict_acc": 0.5035971223021583,
47
+ "inst_level_strict_acc_stderr": 0.0005667334369309774,
48
+ "prompt_level_loose_acc": 0.4584103512014787,
49
+ "prompt_level_loose_acc_stderr": 0.021442010560476534,
50
+ "inst_level_loose_acc": 0.5671462829736211,
51
+ "inst_level_loose_acc_stderr": 0.0005740916373334755
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "ad10e21c9427089f",
117
+ "hash_input_tokens": "76e5046dde448ac8",
118
+ "hash_cont_tokens": "55fce4d54bcc603e"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "edca08807dd5a8ec",
132
+ "hash_input_tokens": "6db7fa56ea148d89",
133
+ "hash_cont_tokens": "f20ba557778f234b"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-09-23T15-18-49.877513.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 2819894.526100757,
8
+ "end_time": 2820167.222449713,
9
+ "total_evaluation_time_secondes": "272.6963489558548",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.41589648798521256,
35
+ "prompt_level_strict_acc_stderr": 0.0212099993568188,
36
+ "inst_level_strict_acc": 0.5371702637889688,
37
+ "inst_level_strict_acc_stderr": 0.0005646791772078428,
38
+ "prompt_level_loose_acc": 0.4584103512014787,
39
+ "prompt_level_loose_acc_stderr": 0.02144201056047653,
40
+ "inst_level_loose_acc": 0.5815347721822542,
41
+ "inst_level_loose_acc_stderr": 0.0005489847919512223
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.41589648798521256,
45
+ "prompt_level_strict_acc_stderr": 0.0212099993568188,
46
+ "inst_level_strict_acc": 0.5371702637889688,
47
+ "inst_level_strict_acc_stderr": 0.0005646791772078428,
48
+ "prompt_level_loose_acc": 0.4584103512014787,
49
+ "prompt_level_loose_acc_stderr": 0.02144201056047653,
50
+ "inst_level_loose_acc": 0.5815347721822542,
51
+ "inst_level_loose_acc_stderr": 0.0005489847919512223
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "3972e282d6494c90",
117
+ "hash_input_tokens": "f702e1af092e9cc3",
118
+ "hash_cont_tokens": "e2458b3184dfdb7c"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "b1fd186794cb0bb3",
132
+ "hash_input_tokens": "5fc219976d10c06f",
133
+ "hash_cont_tokens": "22e60018f1f696a5"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_35/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/2025-08-29T11-16-37.152415/details_extended|ifeval|0_2025-08-29T11-16-37.152415.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_35/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/2025-09-23T15-23-52.120403/details_extended|ifeval|0_2025-09-23T15-23-52.120403.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-29T11-16-37.152415.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18883988.616446614,
8
+ "end_time": 18884516.7507356,
9
+ "total_evaluation_time_secondes": "528.1342889852822",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.3752310536044362,
35
+ "prompt_level_strict_acc_stderr": 0.020835898065035694,
36
+ "inst_level_strict_acc": 0.5023980815347722,
37
+ "inst_level_strict_acc_stderr": 0.0005393909491033796,
38
+ "prompt_level_loose_acc": 0.4288354898336414,
39
+ "prompt_level_loose_acc_stderr": 0.021297522569050743,
40
+ "inst_level_loose_acc": 0.552757793764988,
41
+ "inst_level_loose_acc_stderr": 0.0005344148217521974
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.3752310536044362,
45
+ "prompt_level_strict_acc_stderr": 0.020835898065035694,
46
+ "inst_level_strict_acc": 0.5023980815347722,
47
+ "inst_level_strict_acc_stderr": 0.0005393909491033796,
48
+ "prompt_level_loose_acc": 0.4288354898336414,
49
+ "prompt_level_loose_acc_stderr": 0.021297522569050743,
50
+ "inst_level_loose_acc": 0.552757793764988,
51
+ "inst_level_loose_acc_stderr": 0.0005344148217521974
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "ad10e21c9427089f",
117
+ "hash_input_tokens": "76e5046dde448ac8",
118
+ "hash_cont_tokens": "0f42ca4607b5eab0"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "edca08807dd5a8ec",
132
+ "hash_input_tokens": "6db7fa56ea148d89",
133
+ "hash_cont_tokens": "f40d9eb07b829046"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-09-23T15-23-52.120403.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 2820291.026089232,
8
+ "end_time": 2820469.627462913,
9
+ "total_evaluation_time_secondes": "178.60137368086725",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.4048059149722736,
35
+ "prompt_level_strict_acc_stderr": 0.021123012121053563,
36
+ "inst_level_strict_acc": 0.5287769784172662,
37
+ "inst_level_strict_acc_stderr": 0.0005365061017861071,
38
+ "prompt_level_loose_acc": 0.4565619223659889,
39
+ "prompt_level_loose_acc_stderr": 0.021435222545538896,
40
+ "inst_level_loose_acc": 0.5827338129496403,
41
+ "inst_level_loose_acc_stderr": 0.0005348025912540751
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.4048059149722736,
45
+ "prompt_level_strict_acc_stderr": 0.021123012121053563,
46
+ "inst_level_strict_acc": 0.5287769784172662,
47
+ "inst_level_strict_acc_stderr": 0.0005365061017861071,
48
+ "prompt_level_loose_acc": 0.4565619223659889,
49
+ "prompt_level_loose_acc_stderr": 0.021435222545538896,
50
+ "inst_level_loose_acc": 0.5827338129496403,
51
+ "inst_level_loose_acc_stderr": 0.0005348025912540751
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "3972e282d6494c90",
117
+ "hash_input_tokens": "f702e1af092e9cc3",
118
+ "hash_cont_tokens": "1ecf208bbf533d01"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "b1fd186794cb0bb3",
132
+ "hash_input_tokens": "5fc219976d10c06f",
133
+ "hash_cont_tokens": "643cc6202abfe217"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_5/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/2025-08-29T11-16-14.580600/details_extended|ifeval|0_2025-08-29T11-16-14.580600.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_5/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/2025-09-23T15-18-54.425207/details_extended|ifeval|0_2025-09-23T15-18-54.425207.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-29T11-16-14.580600.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18883988.61094902,
8
+ "end_time": 18884493.887714904,
9
+ "total_evaluation_time_secondes": "505.2767658829689",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.3826247689463956,
35
+ "prompt_level_strict_acc_stderr": 0.020915307841436644,
36
+ "inst_level_strict_acc": 0.5095923261390888,
37
+ "inst_level_strict_acc_stderr": 0.0005601452796683326,
38
+ "prompt_level_loose_acc": 0.4343807763401109,
39
+ "prompt_level_loose_acc_stderr": 0.021330473657564724,
40
+ "inst_level_loose_acc": 0.5707434052757794,
41
+ "inst_level_loose_acc_stderr": 0.0005481574868477439
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.3826247689463956,
45
+ "prompt_level_strict_acc_stderr": 0.020915307841436644,
46
+ "inst_level_strict_acc": 0.5095923261390888,
47
+ "inst_level_strict_acc_stderr": 0.0005601452796683326,
48
+ "prompt_level_loose_acc": 0.4343807763401109,
49
+ "prompt_level_loose_acc_stderr": 0.021330473657564724,
50
+ "inst_level_loose_acc": 0.5707434052757794,
51
+ "inst_level_loose_acc_stderr": 0.0005481574868477439
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "ad10e21c9427089f",
117
+ "hash_input_tokens": "76e5046dde448ac8",
118
+ "hash_cont_tokens": "7c8b4e41f6fd051b"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "edca08807dd5a8ec",
132
+ "hash_input_tokens": "6db7fa56ea148d89",
133
+ "hash_cont_tokens": "b1545cf2f315b64f"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-09-23T15-18-54.425207.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 2819894.607254071,
8
+ "end_time": 2820171.997573846,
9
+ "total_evaluation_time_secondes": "277.39031977485865",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.39186691312384475,
35
+ "prompt_level_strict_acc_stderr": 0.02100737240618835,
36
+ "inst_level_strict_acc": 0.5251798561151079,
37
+ "inst_level_strict_acc_stderr": 0.0005730653846434158,
38
+ "prompt_level_loose_acc": 0.43807763401109057,
39
+ "prompt_level_loose_acc_stderr": 0.021350931135490935,
40
+ "inst_level_loose_acc": 0.5695443645083933,
41
+ "inst_level_loose_acc_stderr": 0.0005581180833934133
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.39186691312384475,
45
+ "prompt_level_strict_acc_stderr": 0.02100737240618835,
46
+ "inst_level_strict_acc": 0.5251798561151079,
47
+ "inst_level_strict_acc_stderr": 0.0005730653846434158,
48
+ "prompt_level_loose_acc": 0.43807763401109057,
49
+ "prompt_level_loose_acc_stderr": 0.021350931135490935,
50
+ "inst_level_loose_acc": 0.5695443645083933,
51
+ "inst_level_loose_acc_stderr": 0.0005581180833934133
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "3972e282d6494c90",
117
+ "hash_input_tokens": "f702e1af092e9cc3",
118
+ "hash_cont_tokens": "e84762b7f983ecbc"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "b1fd186794cb0bb3",
132
+ "hash_input_tokens": "5fc219976d10c06f",
133
+ "hash_cont_tokens": "4dcc542128551811"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/results.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exp_name,global_step,model_name,json_relpath,prompt_level_strict_acc,prompt_level_strict_acc_stderr,inst_level_strict_acc,inst_level_strict_acc_stderr,prompt_level_loose_acc,prompt_level_loose_acc_stderr,inst_level_loose_acc,inst_level_loose_acc_stderr,gpqa_pass@1:1_samples,gpqa_pass@1:1_samples_stderr
2
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,0,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-29T11-17-46.747738.json,0.39371534195933455,0.02102483414587237,0.5215827338129496,0.0005599264307527568,0.4288354898336414,0.021297522569050743,0.5563549160671463,0.0005527714546864814,,
3
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,0,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-09-23T15-20-09.172749.json,0.39371534195933455,0.02102483414587237,0.5191846522781774,0.0005633834275419194,0.4399260628465804,0.021360708220801976,0.564748201438849,0.0005587046293737301,,
4
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,5,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-29T11-16-14.580600.json,0.3826247689463956,0.020915307841436644,0.5095923261390888,0.0005601452796683326,0.4343807763401109,0.021330473657564724,0.5707434052757794,0.0005481574868477439,,
5
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,5,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-09-23T15-18-54.425207.json,0.39186691312384475,0.02100737240618835,0.5251798561151079,0.0005730653846434158,0.43807763401109057,0.021350931135490935,0.5695443645083933,0.0005581180833934133,,
6
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,10,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-09-23T15-19-07.845971.json,0.39186691312384475,0.021007372406188354,0.5143884892086331,0.0005407156424898863,0.43253234750462105,0.02131979239817807,0.552757793764988,0.0005395531273954375,,
7
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,15,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-29T11-16-24.695422.json,0.3752310536044362,0.020835898065035694,0.5095923261390888,0.0005432302541010317,0.4269870609981516,0.02128593305006131,0.5587529976019184,0.000545503372755182,,
8
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,15,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-09-23T15-18-58.766285.json,0.37707948243992606,0.020856233918528456,0.5155875299760192,0.0005624364712452048,0.4343807763401109,0.021330473657564727,0.5671462829736211,0.0005536778232943271,,
9
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,20,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-29T11-16-25.542329.json,0.3807763401109057,0.020895937888190833,0.511990407673861,0.0005284703064941477,0.43807763401109057,0.021350931135490935,0.5599520383693045,0.0005508806104783129,,
10
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,20,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-09-23T15-20-14.009200.json,0.4011090573012939,0.02109153689552074,0.5143884892086331,0.0005584775478161843,0.4510166358595194,0.02141307276535979,0.5635491606714629,0.0005514767622336946,,
11
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,25,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-29T11-16-17.034884.json,0.35489833641404805,0.020590608575926494,0.49640287769784175,0.0005353366435758254,0.4195933456561922,0.021236532548855144,0.5467625899280576,0.000542489217697426,,
12
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,25,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-09-23T15-20-14.861838.json,0.4121996303142329,0.0211822381517332,0.5443645083932853,0.0005250797455930996,0.46950092421441775,0.02147650768114301,0.5911270983213429,0.0005233831979716518,,
13
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,30,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-29T11-17-00.747010.json,0.3844731977818854,0.02093435763458466,0.5035971223021583,0.0005667334369309774,0.4584103512014787,0.021442010560476534,0.5671462829736211,0.0005740916373334755,,
14
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,30,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-09-23T15-18-49.877513.json,0.41589648798521256,0.0212099993568188,0.5371702637889688,0.0005646791772078428,0.4584103512014787,0.02144201056047653,0.5815347721822542,0.0005489847919512223,,
15
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,35,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-29T11-16-37.152415.json,0.3752310536044362,0.020835898065035694,0.5023980815347722,0.0005393909491033796,0.4288354898336414,0.021297522569050743,0.552757793764988,0.0005344148217521974,,
16
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15,35,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-09-23T15-23-52.120403.json,0.4048059149722736,0.021123012121053563,0.5287769784172662,0.0005365061017861071,0.4565619223659889,0.021435222545538896,0.5827338129496403,0.0005348025912540751,,
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_30--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_30--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_30--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_35--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_35--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_35--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_for_llama_Llama-3.2-1B-Instruct_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_5--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff