bensondccnqwc commited on
Commit
11c074c
·
verified ·
1 Parent(s): f93fc7d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results_avg4/global_step_45/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  2. eval_results_avg4/global_step_45/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results_avg4/global_step_5/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results_avg4/global_step_5/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  5. eval_results_avg4/global_step_5/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results_avg4/global_step_5/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  7. eval_results_avg4/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results_avg4/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  9. eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  11. eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  13. eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  15. eval_results_merged/merged.csv +12 -0
  16. eval_results_merged_v3/merged.csv +12 -0
  17. eval_results_merged_v3/missing.txt +1 -0
  18. eval_results_merged_v4/merged.csv +12 -0
  19. eval_results_merged_v4/missing.txt +1 -0
  20. eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-28T15-04-01.057590.json +141 -0
  21. eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-08-28T14-43-43.673554.json +141 -0
  22. eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-28T14-40-51.255236.json +141 -0
  23. eval_results_ood/global_step_20/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/2025-08-28T14-31-58.919046/details_extended|ifeval|0_2025-08-28T14-31-58.919046.csv +0 -0
  24. eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-28T14-31-58.919046.json +141 -0
  25. eval_results_ood/global_step_25/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/2025-08-28T14-23-36.378741/details_extended|ifeval|0_2025-08-28T14-23-36.378741.csv +0 -0
  26. eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-28T14-23-36.378741.json +141 -0
  27. eval_results_ood/global_step_30/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/2025-08-28T14-23-16.687943/details_extended|ifeval|0_2025-08-28T14-23-16.687943.csv +0 -0
  28. eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-28T14-23-16.687943.json +141 -0
  29. eval_results_ood/global_step_35/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/2025-08-28T15-19-52.260147/details_extended|ifeval|0_2025-08-28T15-19-52.260147.csv +0 -0
  30. eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-28T15-19-52.260147.json +141 -0
  31. eval_results_ood/global_step_40/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/2025-08-28T15-18-51.746168/details_extended|ifeval|0_2025-08-28T15-18-51.746168.csv +0 -0
  32. eval_results_ood/global_step_40/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/results_2025-08-28T15-18-51.746168.json +141 -0
  33. eval_results_ood/global_step_45/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/2025-08-28T15-19-48.255403/details_extended|ifeval|0_2025-08-28T15-19-48.255403.csv +0 -0
  34. eval_results_ood/global_step_45/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/results_2025-08-28T15-19-48.255403.json +141 -0
  35. eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-28T14-57-06.728642.json +141 -0
  36. eval_results_ood/global_step_50/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/2025-08-28T15-18-25.365193/details_extended|ifeval|0_2025-08-28T15-18-25.365193.csv +0 -0
  37. eval_results_ood/global_step_50/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/results_2025-08-28T15-18-25.365193.json +141 -0
  38. eval_results_ood/results.csv +12 -0
  39. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  40. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  41. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  42. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  43. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  44. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  45. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  46. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  47. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
  48. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
  49. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
  50. evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
eval_results_avg4/global_step_45/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_45/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_5/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_5/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 3.3,
10
+ "2": 5.0,
11
+ "4": 6.7
12
+ },
13
+ "time_use_in_second": 182.274742603302,
14
+ "time_use_in_minite": "3:02"
15
+ }
eval_results_avg4/global_step_5/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_5/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 3.3,
10
+ "2": 5.0,
11
+ "4": 6.7
12
+ },
13
+ "time_use_in_second": 181.38263416290283,
14
+ "time_use_in_minite": "3:01"
15
+ }
eval_results_avg4/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 29.4,
7
+ "pass_acc": 57.5,
8
+ "pass@k": {
9
+ "1": 29.4,
10
+ "2": 42.5,
11
+ "4": 57.5
12
+ },
13
+ "time_use_in_second": 188.3577175140381,
14
+ "time_use_in_minite": "3:08"
15
+ }
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 6.7,
10
+ "2": 10.0,
11
+ "4": 13.3
12
+ },
13
+ "time_use_in_second": 175.23715782165527,
14
+ "time_use_in_minite": "2:55"
15
+ }
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 2.5,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 2.5,
10
+ "2": 5.0,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 159.8093228340149,
14
+ "time_use_in_minite": "2:39"
15
+ }
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 26.9,
7
+ "pass_acc": 42.5,
8
+ "pass@k": {
9
+ "1": 26.9,
10
+ "2": 35.8,
11
+ "4": 42.5
12
+ },
13
+ "time_use_in_second": 147.4558162689209,
14
+ "time_use_in_minite": "2:27"
15
+ }
eval_results_merged/merged.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global_step,aime24_acc_avg4,aime25_acc_avg4,amc23_acc_avg4,aime24_acc,aime25_acc,amc23_acc,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,gpqa_pass@1:1_samples_ood
2
+ 0,2.50,2.50,20.00,3.30,0.00,17.50,62.50,46.40,13.60,17.60,40.30,20.9,
3
+ 5,3.30,3.30,29.40,3.30,3.30,35.00,69.40,53.40,20.20,21.00,43.00,17.7,
4
+ 10,5.80,4.20,30.00,0.00,3.30,15.00,76.30,56.80,16.50,20.70,44.10,21.6,
5
+ 15,5.00,2.50,31.20,0.00,3.30,25.00,78.20,58.40,23.90,24.90,45.00,26.1,
6
+ 20,3.30,1.70,32.50,6.70,3.30,32.50,78.50,57.20,25.00,24.90,47.70,24.6,
7
+ 25,6.70,3.30,33.80,3.30,3.30,37.50,77.80,59.00,22.80,23.00,49.20,25.7,
8
+ 30,5.80,3.30,31.20,3.30,0.00,32.50,79.40,60.40,24.60,25.00,50.40,26.2,
9
+ 35,4.20,1.70,34.40,3.30,6.70,30.00,80.30,62.20,25.00,25.30,51.60,25.0,
10
+ 40,7.50,4.20,33.10,3.30,0.00,40.00,79.20,60.20,25.40,26.50,53.70,27.4,
11
+ 45,5.00,2.50,32.50,6.70,0.00,22.50,80.10,60.00,25.70,25.50,53.50,27.0,
12
+ 50,6.70,2.50,26.90,3.30,3.30,32.50,80.80,58.80,26.50,24.10,53.80,25.9,
eval_results_merged_v3/merged.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global_step,aime24_acc_avg16,aime25_acc_avg16,amc23_acc_avg16,aime24_acc_avg32,aime25_acc_avg32,amc23_acc_avg32,aime24_acc,aime25_acc,amc23_acc,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,gpqa_pass@1:1_samples_ood
2
+ 0,,,21.90,,,,3.30,0.00,17.50,62.50,46.40,13.60,17.60,40.30,20.9,
3
+ 5,,,24.20,3.10,1.50,,3.30,3.30,35.00,69.40,53.40,20.20,21.00,43.00,17.7,
4
+ 10,,,31.40,5.00,2.90,,0.00,3.30,15.00,76.30,56.80,16.50,20.70,44.10,21.6,
5
+ 15,,,32.20,4.50,3.20,,0.00,3.30,25.00,78.20,58.40,23.90,24.90,45.00,26.1,
6
+ 20,,,35.00,5.10,2.50,,6.70,3.30,32.50,78.50,57.20,25.00,24.90,47.70,24.6,
7
+ 25,,,33.90,5.00,2.70,,3.30,3.30,37.50,77.80,59.00,22.80,23.00,49.20,25.7,
8
+ 30,,,30.90,6.10,2.40,,3.30,0.00,32.50,79.40,60.40,24.60,25.00,50.40,26.2,
9
+ 35,,,33.90,5.60,3.20,,3.30,6.70,30.00,80.30,62.20,25.00,25.30,51.60,25.0,
10
+ 40,,,34.70,5.30,2.20,,3.30,0.00,40.00,79.20,60.20,25.40,26.50,53.70,27.4,
11
+ 45,,,33.10,4.90,2.50,,6.70,0.00,22.50,80.10,60.00,25.70,25.50,53.50,27.0,
12
+ 50,,,34.10,5.30,3.30,,3.30,3.30,32.50,80.80,58.80,26.50,24.10,53.80,25.9,
eval_results_merged_v3/missing.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ step 0: missing avg32
eval_results_merged_v4/merged.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global_step,amc23_acc_avg16,aime24_acc_avg32,aime25_acc_avg32,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,mbpp_base_pass@1,mbpp_plus_pass@1
2
+ 0,21.90,,,62.50,46.40,13.60,17.60,40.30,20.9,,
3
+ 5,24.20,3.10,1.50,69.40,53.40,20.20,21.00,43.00,17.7,54.4,45.6
4
+ 10,31.40,5.00,2.90,76.30,56.80,16.50,20.70,44.10,21.6,55.6,47.0
5
+ 15,32.20,4.50,3.20,78.20,58.40,23.90,24.90,45.00,26.1,55.0,46.1
6
+ 20,35.00,5.10,2.50,78.50,57.20,25.00,24.90,47.70,24.6,56.0,47.2
7
+ 25,33.90,5.00,2.70,77.80,59.00,22.80,23.00,49.20,25.7,57.1,48.2
8
+ 30,30.90,6.10,2.40,79.40,60.40,24.60,25.00,50.40,26.2,57.1,48.3
9
+ 35,33.90,5.60,3.20,80.30,62.20,25.00,25.30,51.60,25.0,54.6,46.5
10
+ 40,34.70,5.30,2.20,79.20,60.20,25.40,26.50,53.70,27.4,57.9,49.3
11
+ 45,33.10,4.90,2.50,80.10,60.00,25.70,25.50,53.50,27.0,56.7,48.3
12
+ 50,34.10,5.30,3.30,80.80,58.80,26.50,24.10,53.80,25.9,56.0,47.4
eval_results_merged_v4/missing.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ step 0: missing avg32, mbpp
eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-28T15-04-01.057590.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18807877.63711604,
8
+ "end_time": 18811748.076254945,
9
+ "total_evaluation_time_secondes": "3870.439138904214",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.2088724584103512,
35
+ "prompt_level_strict_acc_stderr": 0.017493107347793312,
36
+ "inst_level_strict_acc": 0.3249400479616307,
37
+ "inst_level_strict_acc_stderr": 0.0005020381027360004,
38
+ "prompt_level_loose_acc": 0.22181146025878004,
39
+ "prompt_level_loose_acc_stderr": 0.01787876540794439,
40
+ "inst_level_loose_acc": 0.34652278177458035,
41
+ "inst_level_loose_acc_stderr": 0.0005100828283216712
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.2088724584103512,
45
+ "prompt_level_strict_acc_stderr": 0.017493107347793312,
46
+ "inst_level_strict_acc": 0.3249400479616307,
47
+ "inst_level_strict_acc_stderr": 0.0005020381027360004,
48
+ "prompt_level_loose_acc": 0.22181146025878004,
49
+ "prompt_level_loose_acc_stderr": 0.01787876540794439,
50
+ "inst_level_loose_acc": 0.34652278177458035,
51
+ "inst_level_loose_acc_stderr": 0.0005100828283216712
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "5e36dc1464d8a69a"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "868acc3b1dc606da"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-08-28T14-43-43.673554.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18807877.70384178,
8
+ "end_time": 18810538.169428077,
9
+ "total_evaluation_time_secondes": "2660.465586297214",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.21626617375231053,
35
+ "prompt_level_strict_acc_stderr": 0.01771665442891014,
36
+ "inst_level_strict_acc": 0.3333333333333333,
37
+ "inst_level_strict_acc_stderr": 0.000489431816288728,
38
+ "prompt_level_loose_acc": 0.25508317929759705,
39
+ "prompt_level_loose_acc_stderr": 0.018758491950414142,
40
+ "inst_level_loose_acc": 0.36810551558752996,
41
+ "inst_level_loose_acc_stderr": 0.0005234324323312584
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.21626617375231053,
45
+ "prompt_level_strict_acc_stderr": 0.01771665442891014,
46
+ "inst_level_strict_acc": 0.3333333333333333,
47
+ "inst_level_strict_acc_stderr": 0.000489431816288728,
48
+ "prompt_level_loose_acc": 0.25508317929759705,
49
+ "prompt_level_loose_acc_stderr": 0.018758491950414142,
50
+ "inst_level_loose_acc": 0.36810551558752996,
51
+ "inst_level_loose_acc_stderr": 0.0005234324323312584
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "7575bde790fe6f08"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "87fc0142e87ea1db"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-28T14-40-51.255236.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18807877.635745857,
8
+ "end_time": 18810366.315407366,
9
+ "total_evaluation_time_secondes": "2488.6796615086496",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.26062846580406657,
35
+ "prompt_level_strict_acc_stderr": 0.018890584986760273,
36
+ "inst_level_strict_acc": 0.3752997601918465,
37
+ "inst_level_strict_acc_stderr": 0.0005276795699129097,
38
+ "prompt_level_loose_acc": 0.28835489833641403,
39
+ "prompt_level_loose_acc_stderr": 0.019493890350654703,
40
+ "inst_level_loose_acc": 0.4028776978417266,
41
+ "inst_level_loose_acc_stderr": 0.0005295500141012543
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.26062846580406657,
45
+ "prompt_level_strict_acc_stderr": 0.018890584986760273,
46
+ "inst_level_strict_acc": 0.3752997601918465,
47
+ "inst_level_strict_acc_stderr": 0.0005276795699129097,
48
+ "prompt_level_loose_acc": 0.28835489833641403,
49
+ "prompt_level_loose_acc_stderr": 0.019493890350654703,
50
+ "inst_level_loose_acc": 0.4028776978417266,
51
+ "inst_level_loose_acc_stderr": 0.0005295500141012543
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "6a3d0652153c9bb7"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "6417b61d11ee28c3"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_20/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/2025-08-28T14-31-58.919046/details_extended|ifeval|0_2025-08-28T14-31-58.919046.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-28T14-31-58.919046.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18807877.704473533,
8
+ "end_time": 18809835.16786196,
9
+ "total_evaluation_time_secondes": "1957.463388428092",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.24584103512014788,
35
+ "prompt_level_strict_acc_stderr": 0.01852941708079558,
36
+ "inst_level_strict_acc": 0.35851318944844124,
37
+ "inst_level_strict_acc_stderr": 0.0005177367706909966,
38
+ "prompt_level_loose_acc": 0.2846580406654344,
39
+ "prompt_level_loose_acc_stderr": 0.019418769106486003,
40
+ "inst_level_loose_acc": 0.40167865707434053,
41
+ "inst_level_loose_acc_stderr": 0.0005308235859063865
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.24584103512014788,
45
+ "prompt_level_strict_acc_stderr": 0.01852941708079558,
46
+ "inst_level_strict_acc": 0.35851318944844124,
47
+ "inst_level_strict_acc_stderr": 0.0005177367706909966,
48
+ "prompt_level_loose_acc": 0.2846580406654344,
49
+ "prompt_level_loose_acc_stderr": 0.019418769106486003,
50
+ "inst_level_loose_acc": 0.40167865707434053,
51
+ "inst_level_loose_acc_stderr": 0.0005308235859063865
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "9e63b6982edb60a3"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "7f0bbce7193e5835"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_25/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/2025-08-28T14-23-36.378741/details_extended|ifeval|0_2025-08-28T14-23-36.378741.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-28T14-23-36.378741.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18807877.63665532,
8
+ "end_time": 18809334.473295175,
9
+ "total_evaluation_time_secondes": "1456.836639855057",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.25693160813308685,
35
+ "prompt_level_strict_acc_stderr": 0.018802962575636847,
36
+ "inst_level_strict_acc": 0.381294964028777,
37
+ "inst_level_strict_acc_stderr": 0.0005088515258209939,
38
+ "prompt_level_loose_acc": 0.29944547134935307,
39
+ "prompt_level_loose_acc_stderr": 0.019709834029672916,
40
+ "inst_level_loose_acc": 0.4292565947242206,
41
+ "inst_level_loose_acc_stderr": 0.0005359650127318092
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.25693160813308685,
45
+ "prompt_level_strict_acc_stderr": 0.018802962575636847,
46
+ "inst_level_strict_acc": 0.381294964028777,
47
+ "inst_level_strict_acc_stderr": 0.0005088515258209939,
48
+ "prompt_level_loose_acc": 0.29944547134935307,
49
+ "prompt_level_loose_acc_stderr": 0.019709834029672916,
50
+ "inst_level_loose_acc": 0.4292565947242206,
51
+ "inst_level_loose_acc_stderr": 0.0005359650127318092
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "300fbc30a20ca56d"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "8ed3bd6852070947"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_30/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/2025-08-28T14-23-16.687943/details_extended|ifeval|0_2025-08-28T14-23-16.687943.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-28T14-23-16.687943.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18807877.636172168,
8
+ "end_time": 18809314.862311047,
9
+ "total_evaluation_time_secondes": "1437.2261388786137",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.26247689463955637,
35
+ "prompt_level_strict_acc_stderr": 0.018933742876044622,
36
+ "inst_level_strict_acc": 0.3980815347721823,
37
+ "inst_level_strict_acc_stderr": 0.0005233210912873605,
38
+ "prompt_level_loose_acc": 0.31053604436229204,
39
+ "prompt_level_loose_acc_stderr": 0.019912001290591244,
40
+ "inst_level_loose_acc": 0.447242206235012,
41
+ "inst_level_loose_acc_stderr": 0.000530725608886124
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.26247689463955637,
45
+ "prompt_level_strict_acc_stderr": 0.018933742876044622,
46
+ "inst_level_strict_acc": 0.3980815347721823,
47
+ "inst_level_strict_acc_stderr": 0.0005233210912873605,
48
+ "prompt_level_loose_acc": 0.31053604436229204,
49
+ "prompt_level_loose_acc_stderr": 0.019912001290591244,
50
+ "inst_level_loose_acc": 0.447242206235012,
51
+ "inst_level_loose_acc_stderr": 0.000530725608886124
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "8f6719798d9174bd"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "66197f09ecf3792e"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_35/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/2025-08-28T15-19-52.260147/details_extended|ifeval|0_2025-08-28T15-19-52.260147.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-28T15-19-52.260147.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18811863.0122587,
8
+ "end_time": 18812710.173764117,
9
+ "total_evaluation_time_secondes": "847.1615054160357",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.24953789279112754,
35
+ "prompt_level_strict_acc_stderr": 0.018622404509805863,
36
+ "inst_level_strict_acc": 0.37889688249400477,
37
+ "inst_level_strict_acc_stderr": 0.0005113738731364967,
38
+ "prompt_level_loose_acc": 0.3142329020332717,
39
+ "prompt_level_loose_acc_stderr": 0.01997640206515067,
40
+ "inst_level_loose_acc": 0.4448441247002398,
41
+ "inst_level_loose_acc_stderr": 0.0005297346220956167
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.24953789279112754,
45
+ "prompt_level_strict_acc_stderr": 0.018622404509805863,
46
+ "inst_level_strict_acc": 0.37889688249400477,
47
+ "inst_level_strict_acc_stderr": 0.0005113738731364967,
48
+ "prompt_level_loose_acc": 0.3142329020332717,
49
+ "prompt_level_loose_acc_stderr": 0.01997640206515067,
50
+ "inst_level_loose_acc": 0.4448441247002398,
51
+ "inst_level_loose_acc_stderr": 0.0005297346220956167
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "115a2ac2647bf2e6"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "a552e71cdd179caa"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_40/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/2025-08-28T15-18-51.746168/details_extended|ifeval|0_2025-08-28T15-18-51.746168.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_40/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/results_2025-08-28T15-18-51.746168.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18811863.02156565,
8
+ "end_time": 18812650.77936895,
9
+ "total_evaluation_time_secondes": "787.757803298533",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.2735674676524954,
35
+ "prompt_level_strict_acc_stderr": 0.019183727107392825,
36
+ "inst_level_strict_acc": 0.41247002398081534,
37
+ "inst_level_strict_acc_stderr": 0.000532554469072579,
38
+ "prompt_level_loose_acc": 0.34750462107208874,
39
+ "prompt_level_loose_acc_stderr": 0.02049142365341572,
40
+ "inst_level_loose_acc": 0.4832134292565947,
41
+ "inst_level_loose_acc_stderr": 0.0005287169246309893
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.2735674676524954,
45
+ "prompt_level_strict_acc_stderr": 0.019183727107392825,
46
+ "inst_level_strict_acc": 0.41247002398081534,
47
+ "inst_level_strict_acc_stderr": 0.000532554469072579,
48
+ "prompt_level_loose_acc": 0.34750462107208874,
49
+ "prompt_level_loose_acc_stderr": 0.02049142365341572,
50
+ "inst_level_loose_acc": 0.4832134292565947,
51
+ "inst_level_loose_acc_stderr": 0.0005287169246309893
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "88ff4ce1dc04a1a1"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "2f831f811e8d0b88"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_45/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/2025-08-28T15-19-48.255403/details_extended|ifeval|0_2025-08-28T15-19-48.255403.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_45/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/results_2025-08-28T15-19-48.255403.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18811863.018329486,
8
+ "end_time": 18812707.18770021,
9
+ "total_evaluation_time_secondes": "844.1693707220256",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.2698706099815157,
35
+ "prompt_level_strict_acc_stderr": 0.019102087526494303,
36
+ "inst_level_strict_acc": 0.407673860911271,
37
+ "inst_level_strict_acc_stderr": 0.0005080955062228919,
38
+ "prompt_level_loose_acc": 0.3456561922365989,
39
+ "prompt_level_loose_acc_stderr": 0.02046577943268255,
40
+ "inst_level_loose_acc": 0.48081534772182255,
41
+ "inst_level_loose_acc_stderr": 0.0005250291013858589
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.2698706099815157,
45
+ "prompt_level_strict_acc_stderr": 0.019102087526494303,
46
+ "inst_level_strict_acc": 0.407673860911271,
47
+ "inst_level_strict_acc_stderr": 0.0005080955062228919,
48
+ "prompt_level_loose_acc": 0.3456561922365989,
49
+ "prompt_level_loose_acc_stderr": 0.02046577943268255,
50
+ "inst_level_loose_acc": 0.48081534772182255,
51
+ "inst_level_loose_acc_stderr": 0.0005250291013858589
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "c81b0908f6d01a01"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "11c1fdf8fbfac4f1"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-28T14-57-06.728642.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18807877.63640212,
8
+ "end_time": 18811337.61977247,
9
+ "total_evaluation_time_secondes": "3459.9833703525364",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.17744916820702403,
35
+ "prompt_level_strict_acc_stderr": 0.016440744379699793,
36
+ "inst_level_strict_acc": 0.30935251798561153,
37
+ "inst_level_strict_acc_stderr": 0.0004545789816041833,
38
+ "prompt_level_loose_acc": 0.19408502772643252,
39
+ "prompt_level_loose_acc_stderr": 0.0170193805507494,
40
+ "inst_level_loose_acc": 0.3225419664268585,
41
+ "inst_level_loose_acc_stderr": 0.0004593128188518521
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.17744916820702403,
45
+ "prompt_level_strict_acc_stderr": 0.016440744379699793,
46
+ "inst_level_strict_acc": 0.30935251798561153,
47
+ "inst_level_strict_acc_stderr": 0.0004545789816041833,
48
+ "prompt_level_loose_acc": 0.19408502772643252,
49
+ "prompt_level_loose_acc_stderr": 0.0170193805507494,
50
+ "inst_level_loose_acc": 0.3225419664268585,
51
+ "inst_level_loose_acc_stderr": 0.0004593128188518521
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "cb86db8b46b3063a"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "1ff5cba463b1840d"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/global_step_50/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/2025-08-28T15-18-25.365193/details_extended|ifeval|0_2025-08-28T15-18-25.365193.csv ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_ood/global_step_50/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/results_2025-08-28T15-18-25.365193.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 18811863.116520293,
8
+ "end_time": 18812624.911301415,
9
+ "total_evaluation_time_secondes": "761.7947811223567",
10
+ "model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 1.0,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "extended|ifeval|0": {
34
+ "prompt_level_strict_acc": 0.2587800369685767,
35
+ "prompt_level_strict_acc_stderr": 0.018846992560712535,
36
+ "inst_level_strict_acc": 0.38848920863309355,
37
+ "inst_level_strict_acc_stderr": 0.0005002404333580301,
38
+ "prompt_level_loose_acc": 0.3419593345656192,
39
+ "prompt_level_loose_acc_stderr": 0.020413464513923615,
40
+ "inst_level_loose_acc": 0.4676258992805755,
41
+ "inst_level_loose_acc_stderr": 0.0005162613395928805
42
+ },
43
+ "all": {
44
+ "prompt_level_strict_acc": 0.2587800369685767,
45
+ "prompt_level_strict_acc_stderr": 0.018846992560712535,
46
+ "inst_level_strict_acc": 0.38848920863309355,
47
+ "inst_level_strict_acc_stderr": 0.0005002404333580301,
48
+ "prompt_level_loose_acc": 0.3419593345656192,
49
+ "prompt_level_loose_acc_stderr": 0.020413464513923615,
50
+ "inst_level_loose_acc": 0.4676258992805755,
51
+ "inst_level_loose_acc_stderr": 0.0005162613395928805
52
+ }
53
+ },
54
+ "versions": {
55
+ "extended|ifeval|0": "0.1"
56
+ },
57
+ "config_tasks": {
58
+ "extended|ifeval": {
59
+ "name": "ifeval",
60
+ "prompt_function": "ifeval_prompt",
61
+ "hf_repo": "google/IFEval",
62
+ "hf_subset": "default",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "prompt_level_strict_acc",
67
+ "inst_level_strict_acc",
68
+ "prompt_level_loose_acc",
69
+ "inst_level_loose_acc"
70
+ ],
71
+ "higher_is_better": {
72
+ "prompt_level_strict_acc": true,
73
+ "inst_level_strict_acc": true,
74
+ "prompt_level_loose_acc": true,
75
+ "inst_level_loose_acc": true
76
+ },
77
+ "category": "3",
78
+ "use_case": "1",
79
+ "sample_level_fn": "ifeval_metric",
80
+ "corpus_level_fn": {
81
+ "prompt_level_strict_acc": "mean",
82
+ "inst_level_strict_acc": "agg_inst_level_acc",
83
+ "prompt_level_loose_acc": "mean",
84
+ "inst_level_loose_acc": "agg_inst_level_acc"
85
+ }
86
+ }
87
+ ],
88
+ "hf_revision": null,
89
+ "hf_filter": null,
90
+ "hf_avail_splits": [
91
+ "train"
92
+ ],
93
+ "trust_dataset": false,
94
+ "evaluation_splits": [
95
+ "train"
96
+ ],
97
+ "few_shots_split": "train",
98
+ "few_shots_select": "random_sampling",
99
+ "generation_size": 1280,
100
+ "generation_grammar": null,
101
+ "stop_sequence": [],
102
+ "num_samples": null,
103
+ "suite": [
104
+ "extended"
105
+ ],
106
+ "original_num_docs": 541,
107
+ "effective_num_docs": 541,
108
+ "must_remove_duplicate_docs": false,
109
+ "version": "0.1"
110
+ }
111
+ },
112
+ "summary_tasks": {
113
+ "extended|ifeval|0": {
114
+ "hashes": {
115
+ "hash_examples": "e99cbf567588d7c6",
116
+ "hash_full_prompts": "7ea7bf2a8edba8f4",
117
+ "hash_input_tokens": "e3d19e04074f1062",
118
+ "hash_cont_tokens": "639f8c16b4bbe76d"
119
+ },
120
+ "truncated": 0,
121
+ "non_truncated": 541,
122
+ "padded": 0,
123
+ "non_padded": 541,
124
+ "effective_few_shots": 0.0,
125
+ "num_truncated_few_shots": 0
126
+ }
127
+ },
128
+ "summary_general": {
129
+ "hashes": {
130
+ "hash_examples": "ea046ab2c6fc5928",
131
+ "hash_full_prompts": "45f8422f6ad2da79",
132
+ "hash_input_tokens": "32d769c21a57d2c7",
133
+ "hash_cont_tokens": "602d2eb1bbdd7950"
134
+ },
135
+ "truncated": 0,
136
+ "non_truncated": 541,
137
+ "padded": 0,
138
+ "non_padded": 541,
139
+ "num_truncated_few_shots": 0
140
+ }
141
+ }
eval_results_ood/results.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exp_name,global_step,model_name,json_relpath,prompt_level_strict_acc,prompt_level_strict_acc_stderr,inst_level_strict_acc,inst_level_strict_acc_stderr,prompt_level_loose_acc,prompt_level_loose_acc_stderr,inst_level_loose_acc,inst_level_loose_acc_stderr,gpqa_pass@1:1_samples,gpqa_pass@1:1_samples_stderr
2
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,0,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-28T15-04-01.057590.json,0.2088724584103512,0.017493107347793312,0.3249400479616307,0.0005020381027360004,0.22181146025878004,0.01787876540794439,0.34652278177458035,0.0005100828283216712,,
3
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,5,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-28T14-57-06.728642.json,0.17744916820702403,0.016440744379699793,0.30935251798561153,0.0004545789816041833,0.19408502772643252,0.0170193805507494,0.3225419664268585,0.0004593128188518521,,
4
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,10,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-08-28T14-43-43.673554.json,0.21626617375231053,0.01771665442891014,0.3333333333333333,0.000489431816288728,0.25508317929759705,0.018758491950414142,0.36810551558752996,0.0005234324323312584,,
5
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,15,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-28T14-40-51.255236.json,0.26062846580406657,0.018890584986760273,0.3752997601918465,0.0005276795699129097,0.28835489833641403,0.019493890350654703,0.4028776978417266,0.0005295500141012543,,
6
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,20,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-28T14-31-58.919046.json,0.24584103512014788,0.01852941708079558,0.35851318944844124,0.0005177367706909966,0.2846580406654344,0.019418769106486003,0.40167865707434053,0.0005308235859063865,,
7
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,25,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-28T14-23-36.378741.json,0.25693160813308685,0.018802962575636847,0.381294964028777,0.0005088515258209939,0.29944547134935307,0.019709834029672916,0.4292565947242206,0.0005359650127318092,,
8
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,30,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-28T14-23-16.687943.json,0.26247689463955637,0.018933742876044622,0.3980815347721823,0.0005233210912873605,0.31053604436229204,0.019912001290591244,0.447242206235012,0.000530725608886124,,
9
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,35,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-28T15-19-52.260147.json,0.24953789279112754,0.018622404509805863,0.37889688249400477,0.0005113738731364967,0.3142329020332717,0.01997640206515067,0.4448441247002398,0.0005297346220956167,,
10
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,40,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_40/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/results_2025-08-28T15-18-51.746168.json,0.2735674676524954,0.019183727107392825,0.41247002398081534,0.000532554469072579,0.34750462107208874,0.02049142365341572,0.4832134292565947,0.0005287169246309893,,
11
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,45,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_45/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/results_2025-08-28T15-19-48.255403.json,0.2698706099815157,0.019102087526494303,0.407673860911271,0.0005080955062228919,0.3456561922365989,0.02046577943268255,0.48081534772182255,0.0005250291013858589,,
12
+ verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,50,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_50/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/results_2025-08-28T15-18-25.365193.json,0.2587800369685767,0.018846992560712535,0.38848920863309355,0.0005002404333580301,0.3419593345656192,0.020413464513923615,0.4676258992805755,0.0005162613395928805,,
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.eval_results.json ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.raw.jsonl ADDED
The diff for this file is too large to render. See raw diff