Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- eval_results_avg4/global_step_45/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_45/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_5/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_5/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_5/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_5/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
- eval_results_merged/merged.csv +12 -0
- eval_results_merged_v3/merged.csv +12 -0
- eval_results_merged_v3/missing.txt +1 -0
- eval_results_merged_v4/merged.csv +12 -0
- eval_results_merged_v4/missing.txt +1 -0
- eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-28T15-04-01.057590.json +141 -0
- eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-08-28T14-43-43.673554.json +141 -0
- eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-28T14-40-51.255236.json +141 -0
- eval_results_ood/global_step_20/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/2025-08-28T14-31-58.919046/details_extended|ifeval|0_2025-08-28T14-31-58.919046.csv +0 -0
- eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-28T14-31-58.919046.json +141 -0
- eval_results_ood/global_step_25/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/2025-08-28T14-23-36.378741/details_extended|ifeval|0_2025-08-28T14-23-36.378741.csv +0 -0
- eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-28T14-23-36.378741.json +141 -0
- eval_results_ood/global_step_30/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/2025-08-28T14-23-16.687943/details_extended|ifeval|0_2025-08-28T14-23-16.687943.csv +0 -0
- eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-28T14-23-16.687943.json +141 -0
- eval_results_ood/global_step_35/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/2025-08-28T15-19-52.260147/details_extended|ifeval|0_2025-08-28T15-19-52.260147.csv +0 -0
- eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-28T15-19-52.260147.json +141 -0
- eval_results_ood/global_step_40/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/2025-08-28T15-18-51.746168/details_extended|ifeval|0_2025-08-28T15-18-51.746168.csv +0 -0
- eval_results_ood/global_step_40/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/results_2025-08-28T15-18-51.746168.json +141 -0
- eval_results_ood/global_step_45/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/2025-08-28T15-19-48.255403/details_extended|ifeval|0_2025-08-28T15-19-48.255403.csv +0 -0
- eval_results_ood/global_step_45/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/results_2025-08-28T15-19-48.255403.json +141 -0
- eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-28T14-57-06.728642.json +141 -0
- eval_results_ood/global_step_50/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/2025-08-28T15-18-25.365193/details_extended|ifeval|0_2025-08-28T15-18-25.365193.csv +0 -0
- eval_results_ood/global_step_50/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/results_2025-08-28T15-18-25.365193.json +141 -0
- eval_results_ood/results.csv +12 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.eval_results.json +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.jsonl +0 -0
- evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.raw.jsonl +0 -0
eval_results_avg4/global_step_45/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_45/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_5/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_5/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3,
|
| 10 |
+
"2": 5.0,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 182.274742603302,
|
| 14 |
+
"time_use_in_minite": "3:02"
|
| 15 |
+
}
|
eval_results_avg4/global_step_5/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_5/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3,
|
| 10 |
+
"2": 5.0,
|
| 11 |
+
"4": 6.7
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 181.38263416290283,
|
| 14 |
+
"time_use_in_minite": "3:01"
|
| 15 |
+
}
|
eval_results_avg4/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_5/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 29.4,
|
| 7 |
+
"pass_acc": 57.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 29.4,
|
| 10 |
+
"2": 42.5,
|
| 11 |
+
"4": 57.5
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 188.3577175140381,
|
| 14 |
+
"time_use_in_minite": "3:08"
|
| 15 |
+
}
|
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7,
|
| 10 |
+
"2": 10.0,
|
| 11 |
+
"4": 13.3
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 175.23715782165527,
|
| 14 |
+
"time_use_in_minite": "2:55"
|
| 15 |
+
}
|
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 120,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 2.5,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 2.5,
|
| 10 |
+
"2": 5.0,
|
| 11 |
+
"4": 10.0
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 159.8093228340149,
|
| 14 |
+
"time_use_in_minite": "2:39"
|
| 15 |
+
}
|
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_avg4/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 160,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 26.9,
|
| 7 |
+
"pass_acc": 42.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 26.9,
|
| 10 |
+
"2": 35.8,
|
| 11 |
+
"4": 42.5
|
| 12 |
+
},
|
| 13 |
+
"time_use_in_second": 147.4558162689209,
|
| 14 |
+
"time_use_in_minite": "2:27"
|
| 15 |
+
}
|
eval_results_merged/merged.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global_step,aime24_acc_avg4,aime25_acc_avg4,amc23_acc_avg4,aime24_acc,aime25_acc,amc23_acc,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,gpqa_pass@1:1_samples_ood
|
| 2 |
+
0,2.50,2.50,20.00,3.30,0.00,17.50,62.50,46.40,13.60,17.60,40.30,20.9,
|
| 3 |
+
5,3.30,3.30,29.40,3.30,3.30,35.00,69.40,53.40,20.20,21.00,43.00,17.7,
|
| 4 |
+
10,5.80,4.20,30.00,0.00,3.30,15.00,76.30,56.80,16.50,20.70,44.10,21.6,
|
| 5 |
+
15,5.00,2.50,31.20,0.00,3.30,25.00,78.20,58.40,23.90,24.90,45.00,26.1,
|
| 6 |
+
20,3.30,1.70,32.50,6.70,3.30,32.50,78.50,57.20,25.00,24.90,47.70,24.6,
|
| 7 |
+
25,6.70,3.30,33.80,3.30,3.30,37.50,77.80,59.00,22.80,23.00,49.20,25.7,
|
| 8 |
+
30,5.80,3.30,31.20,3.30,0.00,32.50,79.40,60.40,24.60,25.00,50.40,26.2,
|
| 9 |
+
35,4.20,1.70,34.40,3.30,6.70,30.00,80.30,62.20,25.00,25.30,51.60,25.0,
|
| 10 |
+
40,7.50,4.20,33.10,3.30,0.00,40.00,79.20,60.20,25.40,26.50,53.70,27.4,
|
| 11 |
+
45,5.00,2.50,32.50,6.70,0.00,22.50,80.10,60.00,25.70,25.50,53.50,27.0,
|
| 12 |
+
50,6.70,2.50,26.90,3.30,3.30,32.50,80.80,58.80,26.50,24.10,53.80,25.9,
|
eval_results_merged_v3/merged.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global_step,aime24_acc_avg16,aime25_acc_avg16,amc23_acc_avg16,aime24_acc_avg32,aime25_acc_avg32,amc23_acc_avg32,aime24_acc,aime25_acc,amc23_acc,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,gpqa_pass@1:1_samples_ood
|
| 2 |
+
0,,,21.90,,,,3.30,0.00,17.50,62.50,46.40,13.60,17.60,40.30,20.9,
|
| 3 |
+
5,,,24.20,3.10,1.50,,3.30,3.30,35.00,69.40,53.40,20.20,21.00,43.00,17.7,
|
| 4 |
+
10,,,31.40,5.00,2.90,,0.00,3.30,15.00,76.30,56.80,16.50,20.70,44.10,21.6,
|
| 5 |
+
15,,,32.20,4.50,3.20,,0.00,3.30,25.00,78.20,58.40,23.90,24.90,45.00,26.1,
|
| 6 |
+
20,,,35.00,5.10,2.50,,6.70,3.30,32.50,78.50,57.20,25.00,24.90,47.70,24.6,
|
| 7 |
+
25,,,33.90,5.00,2.70,,3.30,3.30,37.50,77.80,59.00,22.80,23.00,49.20,25.7,
|
| 8 |
+
30,,,30.90,6.10,2.40,,3.30,0.00,32.50,79.40,60.40,24.60,25.00,50.40,26.2,
|
| 9 |
+
35,,,33.90,5.60,3.20,,3.30,6.70,30.00,80.30,62.20,25.00,25.30,51.60,25.0,
|
| 10 |
+
40,,,34.70,5.30,2.20,,3.30,0.00,40.00,79.20,60.20,25.40,26.50,53.70,27.4,
|
| 11 |
+
45,,,33.10,4.90,2.50,,6.70,0.00,22.50,80.10,60.00,25.70,25.50,53.50,27.0,
|
| 12 |
+
50,,,34.10,5.30,3.30,,3.30,3.30,32.50,80.80,58.80,26.50,24.10,53.80,25.9,
|
eval_results_merged_v3/missing.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
step 0: missing avg32
|
eval_results_merged_v4/merged.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global_step,amc23_acc_avg16,aime24_acc_avg32,aime25_acc_avg32,gsm8k_acc,math500_acc,minerva_math_acc,olympiadbench_acc,mmlu_stem_acc,prompt_level_strict_acc_ood,mbpp_base_pass@1,mbpp_plus_pass@1
|
| 2 |
+
0,21.90,,,62.50,46.40,13.60,17.60,40.30,20.9,,
|
| 3 |
+
5,24.20,3.10,1.50,69.40,53.40,20.20,21.00,43.00,17.7,54.4,45.6
|
| 4 |
+
10,31.40,5.00,2.90,76.30,56.80,16.50,20.70,44.10,21.6,55.6,47.0
|
| 5 |
+
15,32.20,4.50,3.20,78.20,58.40,23.90,24.90,45.00,26.1,55.0,46.1
|
| 6 |
+
20,35.00,5.10,2.50,78.50,57.20,25.00,24.90,47.70,24.6,56.0,47.2
|
| 7 |
+
25,33.90,5.00,2.70,77.80,59.00,22.80,23.00,49.20,25.7,57.1,48.2
|
| 8 |
+
30,30.90,6.10,2.40,79.40,60.40,24.60,25.00,50.40,26.2,57.1,48.3
|
| 9 |
+
35,33.90,5.60,3.20,80.30,62.20,25.00,25.30,51.60,25.0,54.6,46.5
|
| 10 |
+
40,34.70,5.30,2.20,79.20,60.20,25.40,26.50,53.70,27.4,57.9,49.3
|
| 11 |
+
45,33.10,4.90,2.50,80.10,60.00,25.70,25.50,53.50,27.0,56.7,48.3
|
| 12 |
+
50,34.10,5.30,3.30,80.80,58.80,26.50,24.10,53.80,25.9,56.0,47.4
|
eval_results_merged_v4/missing.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
step 0: missing avg32, mbpp
|
eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-28T15-04-01.057590.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18807877.63711604,
|
| 8 |
+
"end_time": 18811748.076254945,
|
| 9 |
+
"total_evaluation_time_secondes": "3870.439138904214",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.2088724584103512,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.017493107347793312,
|
| 36 |
+
"inst_level_strict_acc": 0.3249400479616307,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005020381027360004,
|
| 38 |
+
"prompt_level_loose_acc": 0.22181146025878004,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.01787876540794439,
|
| 40 |
+
"inst_level_loose_acc": 0.34652278177458035,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005100828283216712
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.2088724584103512,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.017493107347793312,
|
| 46 |
+
"inst_level_strict_acc": 0.3249400479616307,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005020381027360004,
|
| 48 |
+
"prompt_level_loose_acc": 0.22181146025878004,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.01787876540794439,
|
| 50 |
+
"inst_level_loose_acc": 0.34652278177458035,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005100828283216712
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "5e36dc1464d8a69a"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "868acc3b1dc606da"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-08-28T14-43-43.673554.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18807877.70384178,
|
| 8 |
+
"end_time": 18810538.169428077,
|
| 9 |
+
"total_evaluation_time_secondes": "2660.465586297214",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.21626617375231053,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.01771665442891014,
|
| 36 |
+
"inst_level_strict_acc": 0.3333333333333333,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.000489431816288728,
|
| 38 |
+
"prompt_level_loose_acc": 0.25508317929759705,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.018758491950414142,
|
| 40 |
+
"inst_level_loose_acc": 0.36810551558752996,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005234324323312584
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.21626617375231053,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.01771665442891014,
|
| 46 |
+
"inst_level_strict_acc": 0.3333333333333333,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.000489431816288728,
|
| 48 |
+
"prompt_level_loose_acc": 0.25508317929759705,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.018758491950414142,
|
| 50 |
+
"inst_level_loose_acc": 0.36810551558752996,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005234324323312584
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "7575bde790fe6f08"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "87fc0142e87ea1db"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-28T14-40-51.255236.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18807877.635745857,
|
| 8 |
+
"end_time": 18810366.315407366,
|
| 9 |
+
"total_evaluation_time_secondes": "2488.6796615086496",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.26062846580406657,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.018890584986760273,
|
| 36 |
+
"inst_level_strict_acc": 0.3752997601918465,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005276795699129097,
|
| 38 |
+
"prompt_level_loose_acc": 0.28835489833641403,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.019493890350654703,
|
| 40 |
+
"inst_level_loose_acc": 0.4028776978417266,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005295500141012543
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.26062846580406657,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.018890584986760273,
|
| 46 |
+
"inst_level_strict_acc": 0.3752997601918465,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005276795699129097,
|
| 48 |
+
"prompt_level_loose_acc": 0.28835489833641403,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.019493890350654703,
|
| 50 |
+
"inst_level_loose_acc": 0.4028776978417266,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005295500141012543
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "6a3d0652153c9bb7"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "6417b61d11ee28c3"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_20/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/2025-08-28T14-31-58.919046/details_extended|ifeval|0_2025-08-28T14-31-58.919046.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-28T14-31-58.919046.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18807877.704473533,
|
| 8 |
+
"end_time": 18809835.16786196,
|
| 9 |
+
"total_evaluation_time_secondes": "1957.463388428092",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.24584103512014788,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.01852941708079558,
|
| 36 |
+
"inst_level_strict_acc": 0.35851318944844124,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005177367706909966,
|
| 38 |
+
"prompt_level_loose_acc": 0.2846580406654344,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.019418769106486003,
|
| 40 |
+
"inst_level_loose_acc": 0.40167865707434053,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005308235859063865
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.24584103512014788,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.01852941708079558,
|
| 46 |
+
"inst_level_strict_acc": 0.35851318944844124,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005177367706909966,
|
| 48 |
+
"prompt_level_loose_acc": 0.2846580406654344,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.019418769106486003,
|
| 50 |
+
"inst_level_loose_acc": 0.40167865707434053,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005308235859063865
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "9e63b6982edb60a3"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "7f0bbce7193e5835"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_25/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/2025-08-28T14-23-36.378741/details_extended|ifeval|0_2025-08-28T14-23-36.378741.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-28T14-23-36.378741.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18807877.63665532,
|
| 8 |
+
"end_time": 18809334.473295175,
|
| 9 |
+
"total_evaluation_time_secondes": "1456.836639855057",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.25693160813308685,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.018802962575636847,
|
| 36 |
+
"inst_level_strict_acc": 0.381294964028777,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005088515258209939,
|
| 38 |
+
"prompt_level_loose_acc": 0.29944547134935307,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.019709834029672916,
|
| 40 |
+
"inst_level_loose_acc": 0.4292565947242206,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005359650127318092
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.25693160813308685,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.018802962575636847,
|
| 46 |
+
"inst_level_strict_acc": 0.381294964028777,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005088515258209939,
|
| 48 |
+
"prompt_level_loose_acc": 0.29944547134935307,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.019709834029672916,
|
| 50 |
+
"inst_level_loose_acc": 0.4292565947242206,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005359650127318092
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "300fbc30a20ca56d"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "8ed3bd6852070947"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_30/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/2025-08-28T14-23-16.687943/details_extended|ifeval|0_2025-08-28T14-23-16.687943.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-28T14-23-16.687943.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18807877.636172168,
|
| 8 |
+
"end_time": 18809314.862311047,
|
| 9 |
+
"total_evaluation_time_secondes": "1437.2261388786137",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.26247689463955637,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.018933742876044622,
|
| 36 |
+
"inst_level_strict_acc": 0.3980815347721823,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005233210912873605,
|
| 38 |
+
"prompt_level_loose_acc": 0.31053604436229204,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.019912001290591244,
|
| 40 |
+
"inst_level_loose_acc": 0.447242206235012,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.000530725608886124
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.26247689463955637,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.018933742876044622,
|
| 46 |
+
"inst_level_strict_acc": 0.3980815347721823,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005233210912873605,
|
| 48 |
+
"prompt_level_loose_acc": 0.31053604436229204,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.019912001290591244,
|
| 50 |
+
"inst_level_loose_acc": 0.447242206235012,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.000530725608886124
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "8f6719798d9174bd"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "66197f09ecf3792e"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_35/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/2025-08-28T15-19-52.260147/details_extended|ifeval|0_2025-08-28T15-19-52.260147.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-28T15-19-52.260147.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18811863.0122587,
|
| 8 |
+
"end_time": 18812710.173764117,
|
| 9 |
+
"total_evaluation_time_secondes": "847.1615054160357",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.24953789279112754,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.018622404509805863,
|
| 36 |
+
"inst_level_strict_acc": 0.37889688249400477,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005113738731364967,
|
| 38 |
+
"prompt_level_loose_acc": 0.3142329020332717,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.01997640206515067,
|
| 40 |
+
"inst_level_loose_acc": 0.4448441247002398,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005297346220956167
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.24953789279112754,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.018622404509805863,
|
| 46 |
+
"inst_level_strict_acc": 0.37889688249400477,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005113738731364967,
|
| 48 |
+
"prompt_level_loose_acc": 0.3142329020332717,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.01997640206515067,
|
| 50 |
+
"inst_level_loose_acc": 0.4448441247002398,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005297346220956167
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "115a2ac2647bf2e6"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "a552e71cdd179caa"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_40/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/2025-08-28T15-18-51.746168/details_extended|ifeval|0_2025-08-28T15-18-51.746168.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_40/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/results_2025-08-28T15-18-51.746168.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18811863.02156565,
|
| 8 |
+
"end_time": 18812650.77936895,
|
| 9 |
+
"total_evaluation_time_secondes": "787.757803298533",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.2735674676524954,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.019183727107392825,
|
| 36 |
+
"inst_level_strict_acc": 0.41247002398081534,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.000532554469072579,
|
| 38 |
+
"prompt_level_loose_acc": 0.34750462107208874,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.02049142365341572,
|
| 40 |
+
"inst_level_loose_acc": 0.4832134292565947,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005287169246309893
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.2735674676524954,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.019183727107392825,
|
| 46 |
+
"inst_level_strict_acc": 0.41247002398081534,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.000532554469072579,
|
| 48 |
+
"prompt_level_loose_acc": 0.34750462107208874,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.02049142365341572,
|
| 50 |
+
"inst_level_loose_acc": 0.4832134292565947,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005287169246309893
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "88ff4ce1dc04a1a1"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "2f831f811e8d0b88"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_45/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/2025-08-28T15-19-48.255403/details_extended|ifeval|0_2025-08-28T15-19-48.255403.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_45/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/results_2025-08-28T15-19-48.255403.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18811863.018329486,
|
| 8 |
+
"end_time": 18812707.18770021,
|
| 9 |
+
"total_evaluation_time_secondes": "844.1693707220256",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.2698706099815157,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.019102087526494303,
|
| 36 |
+
"inst_level_strict_acc": 0.407673860911271,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005080955062228919,
|
| 38 |
+
"prompt_level_loose_acc": 0.3456561922365989,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.02046577943268255,
|
| 40 |
+
"inst_level_loose_acc": 0.48081534772182255,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005250291013858589
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.2698706099815157,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.019102087526494303,
|
| 46 |
+
"inst_level_strict_acc": 0.407673860911271,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005080955062228919,
|
| 48 |
+
"prompt_level_loose_acc": 0.3456561922365989,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.02046577943268255,
|
| 50 |
+
"inst_level_loose_acc": 0.48081534772182255,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005250291013858589
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "c81b0908f6d01a01"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "11c1fdf8fbfac4f1"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-28T14-57-06.728642.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18807877.63640212,
|
| 8 |
+
"end_time": 18811337.61977247,
|
| 9 |
+
"total_evaluation_time_secondes": "3459.9833703525364",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.17744916820702403,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.016440744379699793,
|
| 36 |
+
"inst_level_strict_acc": 0.30935251798561153,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0004545789816041833,
|
| 38 |
+
"prompt_level_loose_acc": 0.19408502772643252,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.0170193805507494,
|
| 40 |
+
"inst_level_loose_acc": 0.3225419664268585,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0004593128188518521
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.17744916820702403,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.016440744379699793,
|
| 46 |
+
"inst_level_strict_acc": 0.30935251798561153,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0004545789816041833,
|
| 48 |
+
"prompt_level_loose_acc": 0.19408502772643252,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.0170193805507494,
|
| 50 |
+
"inst_level_loose_acc": 0.3225419664268585,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0004593128188518521
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "cb86db8b46b3063a"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "1ff5cba463b1840d"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/global_step_50/details/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/2025-08-28T15-18-25.365193/details_extended|ifeval|0_2025-08-28T15-18-25.365193.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results_ood/global_step_50/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/results_2025-08-28T15-18-25.365193.json
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"config_general": {
|
| 3 |
+
"lighteval_sha": "327071fe86e427d880f907a51d1462f4a3f951c1",
|
| 4 |
+
"num_fewshot_seeds": 1,
|
| 5 |
+
"max_samples": null,
|
| 6 |
+
"job_id": 0,
|
| 7 |
+
"start_time": 18811863.116520293,
|
| 8 |
+
"end_time": 18812624.911301415,
|
| 9 |
+
"total_evaluation_time_secondes": "761.7947811223567",
|
| 10 |
+
"model_name": "_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface",
|
| 11 |
+
"model_sha": "",
|
| 12 |
+
"model_dtype": null,
|
| 13 |
+
"model_size": null,
|
| 14 |
+
"generation_parameters": {
|
| 15 |
+
"early_stopping": null,
|
| 16 |
+
"repetition_penalty": null,
|
| 17 |
+
"frequency_penalty": null,
|
| 18 |
+
"length_penalty": null,
|
| 19 |
+
"presence_penalty": null,
|
| 20 |
+
"max_new_tokens": 32768,
|
| 21 |
+
"min_new_tokens": null,
|
| 22 |
+
"seed": null,
|
| 23 |
+
"stop_tokens": null,
|
| 24 |
+
"temperature": 1.0,
|
| 25 |
+
"top_k": null,
|
| 26 |
+
"min_p": null,
|
| 27 |
+
"top_p": 0.95,
|
| 28 |
+
"truncate_prompt": null,
|
| 29 |
+
"response_format": null
|
| 30 |
+
}
|
| 31 |
+
},
|
| 32 |
+
"results": {
|
| 33 |
+
"extended|ifeval|0": {
|
| 34 |
+
"prompt_level_strict_acc": 0.2587800369685767,
|
| 35 |
+
"prompt_level_strict_acc_stderr": 0.018846992560712535,
|
| 36 |
+
"inst_level_strict_acc": 0.38848920863309355,
|
| 37 |
+
"inst_level_strict_acc_stderr": 0.0005002404333580301,
|
| 38 |
+
"prompt_level_loose_acc": 0.3419593345656192,
|
| 39 |
+
"prompt_level_loose_acc_stderr": 0.020413464513923615,
|
| 40 |
+
"inst_level_loose_acc": 0.4676258992805755,
|
| 41 |
+
"inst_level_loose_acc_stderr": 0.0005162613395928805
|
| 42 |
+
},
|
| 43 |
+
"all": {
|
| 44 |
+
"prompt_level_strict_acc": 0.2587800369685767,
|
| 45 |
+
"prompt_level_strict_acc_stderr": 0.018846992560712535,
|
| 46 |
+
"inst_level_strict_acc": 0.38848920863309355,
|
| 47 |
+
"inst_level_strict_acc_stderr": 0.0005002404333580301,
|
| 48 |
+
"prompt_level_loose_acc": 0.3419593345656192,
|
| 49 |
+
"prompt_level_loose_acc_stderr": 0.020413464513923615,
|
| 50 |
+
"inst_level_loose_acc": 0.4676258992805755,
|
| 51 |
+
"inst_level_loose_acc_stderr": 0.0005162613395928805
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"versions": {
|
| 55 |
+
"extended|ifeval|0": "0.1"
|
| 56 |
+
},
|
| 57 |
+
"config_tasks": {
|
| 58 |
+
"extended|ifeval": {
|
| 59 |
+
"name": "ifeval",
|
| 60 |
+
"prompt_function": "ifeval_prompt",
|
| 61 |
+
"hf_repo": "google/IFEval",
|
| 62 |
+
"hf_subset": "default",
|
| 63 |
+
"metric": [
|
| 64 |
+
{
|
| 65 |
+
"metric_name": [
|
| 66 |
+
"prompt_level_strict_acc",
|
| 67 |
+
"inst_level_strict_acc",
|
| 68 |
+
"prompt_level_loose_acc",
|
| 69 |
+
"inst_level_loose_acc"
|
| 70 |
+
],
|
| 71 |
+
"higher_is_better": {
|
| 72 |
+
"prompt_level_strict_acc": true,
|
| 73 |
+
"inst_level_strict_acc": true,
|
| 74 |
+
"prompt_level_loose_acc": true,
|
| 75 |
+
"inst_level_loose_acc": true
|
| 76 |
+
},
|
| 77 |
+
"category": "3",
|
| 78 |
+
"use_case": "1",
|
| 79 |
+
"sample_level_fn": "ifeval_metric",
|
| 80 |
+
"corpus_level_fn": {
|
| 81 |
+
"prompt_level_strict_acc": "mean",
|
| 82 |
+
"inst_level_strict_acc": "agg_inst_level_acc",
|
| 83 |
+
"prompt_level_loose_acc": "mean",
|
| 84 |
+
"inst_level_loose_acc": "agg_inst_level_acc"
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
"hf_revision": null,
|
| 89 |
+
"hf_filter": null,
|
| 90 |
+
"hf_avail_splits": [
|
| 91 |
+
"train"
|
| 92 |
+
],
|
| 93 |
+
"trust_dataset": false,
|
| 94 |
+
"evaluation_splits": [
|
| 95 |
+
"train"
|
| 96 |
+
],
|
| 97 |
+
"few_shots_split": "train",
|
| 98 |
+
"few_shots_select": "random_sampling",
|
| 99 |
+
"generation_size": 1280,
|
| 100 |
+
"generation_grammar": null,
|
| 101 |
+
"stop_sequence": [],
|
| 102 |
+
"num_samples": null,
|
| 103 |
+
"suite": [
|
| 104 |
+
"extended"
|
| 105 |
+
],
|
| 106 |
+
"original_num_docs": 541,
|
| 107 |
+
"effective_num_docs": 541,
|
| 108 |
+
"must_remove_duplicate_docs": false,
|
| 109 |
+
"version": "0.1"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"summary_tasks": {
|
| 113 |
+
"extended|ifeval|0": {
|
| 114 |
+
"hashes": {
|
| 115 |
+
"hash_examples": "e99cbf567588d7c6",
|
| 116 |
+
"hash_full_prompts": "7ea7bf2a8edba8f4",
|
| 117 |
+
"hash_input_tokens": "e3d19e04074f1062",
|
| 118 |
+
"hash_cont_tokens": "639f8c16b4bbe76d"
|
| 119 |
+
},
|
| 120 |
+
"truncated": 0,
|
| 121 |
+
"non_truncated": 541,
|
| 122 |
+
"padded": 0,
|
| 123 |
+
"non_padded": 541,
|
| 124 |
+
"effective_few_shots": 0.0,
|
| 125 |
+
"num_truncated_few_shots": 0
|
| 126 |
+
}
|
| 127 |
+
},
|
| 128 |
+
"summary_general": {
|
| 129 |
+
"hashes": {
|
| 130 |
+
"hash_examples": "ea046ab2c6fc5928",
|
| 131 |
+
"hash_full_prompts": "45f8422f6ad2da79",
|
| 132 |
+
"hash_input_tokens": "32d769c21a57d2c7",
|
| 133 |
+
"hash_cont_tokens": "602d2eb1bbdd7950"
|
| 134 |
+
},
|
| 135 |
+
"truncated": 0,
|
| 136 |
+
"non_truncated": 541,
|
| 137 |
+
"padded": 0,
|
| 138 |
+
"non_padded": 541,
|
| 139 |
+
"num_truncated_few_shots": 0
|
| 140 |
+
}
|
| 141 |
+
}
|
eval_results_ood/results.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
exp_name,global_step,model_name,json_relpath,prompt_level_strict_acc,prompt_level_strict_acc_stderr,inst_level_strict_acc,inst_level_strict_acc_stderr,prompt_level_loose_acc,prompt_level_loose_acc_stderr,inst_level_loose_acc,inst_level_loose_acc_stderr,gpqa_pass@1:1_samples,gpqa_pass@1:1_samples_stderr
|
| 2 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,0,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_0/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_0_actor_huggingface/results_2025-08-28T15-04-01.057590.json,0.2088724584103512,0.017493107347793312,0.3249400479616307,0.0005020381027360004,0.22181146025878004,0.01787876540794439,0.34652278177458035,0.0005100828283216712,,
|
| 3 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,5,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_5/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_5_actor_huggingface/results_2025-08-28T14-57-06.728642.json,0.17744916820702403,0.016440744379699793,0.30935251798561153,0.0004545789816041833,0.19408502772643252,0.0170193805507494,0.3225419664268585,0.0004593128188518521,,
|
| 4 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,10,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_10/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_10_actor_huggingface/results_2025-08-28T14-43-43.673554.json,0.21626617375231053,0.01771665442891014,0.3333333333333333,0.000489431816288728,0.25508317929759705,0.018758491950414142,0.36810551558752996,0.0005234324323312584,,
|
| 5 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,15,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_15/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_15_actor_huggingface/results_2025-08-28T14-40-51.255236.json,0.26062846580406657,0.018890584986760273,0.3752997601918465,0.0005276795699129097,0.28835489833641403,0.019493890350654703,0.4028776978417266,0.0005295500141012543,,
|
| 6 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,20,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_20/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_20_actor_huggingface/results_2025-08-28T14-31-58.919046.json,0.24584103512014788,0.01852941708079558,0.35851318944844124,0.0005177367706909966,0.2846580406654344,0.019418769106486003,0.40167865707434053,0.0005308235859063865,,
|
| 7 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,25,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_25/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_25_actor_huggingface/results_2025-08-28T14-23-36.378741.json,0.25693160813308685,0.018802962575636847,0.381294964028777,0.0005088515258209939,0.29944547134935307,0.019709834029672916,0.4292565947242206,0.0005359650127318092,,
|
| 8 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,30,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_30/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_30_actor_huggingface/results_2025-08-28T14-23-16.687943.json,0.26247689463955637,0.018933742876044622,0.3980815347721823,0.0005233210912873605,0.31053604436229204,0.019912001290591244,0.447242206235012,0.000530725608886124,,
|
| 9 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,35,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_35/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_35_actor_huggingface/results_2025-08-28T15-19-52.260147.json,0.24953789279112754,0.018622404509805863,0.37889688249400477,0.0005113738731364967,0.3142329020332717,0.01997640206515067,0.4448441247002398,0.0005297346220956167,,
|
| 10 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,40,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_40/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_40_actor_huggingface/results_2025-08-28T15-18-51.746168.json,0.2735674676524954,0.019183727107392825,0.41247002398081534,0.000532554469072579,0.34750462107208874,0.02049142365341572,0.4832134292565947,0.0005287169246309893,,
|
| 11 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,45,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_45/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_45_actor_huggingface/results_2025-08-28T15-19-48.255403.json,0.2698706099815157,0.019102087526494303,0.407673860911271,0.0005080955062228919,0.3456561922365989,0.02046577943268255,0.48081534772182255,0.0005250291013858589,,
|
| 12 |
+
verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15,50,_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface,verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15/eval_results_ood/global_step_50/results/_home_work_compass_innovation_minzijun_checkpoints_verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15_global_step_50_actor_huggingface/results_2025-08-28T15-18-25.365193.json,0.2587800369685767,0.018846992560712535,0.38848920863309355,0.0005002404333580301,0.3419593345656192,0.020413464513923615,0.4676258992805755,0.0005162613395928805,,
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.eval_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_10--actor--huggingface_vllm_temp_1.0.raw.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.eval_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_15--actor--huggingface_vllm_temp_1.0.raw.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.eval_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_20--actor--huggingface_vllm_temp_1.0.raw.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.eval_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evalplus_results/mbpp/home--work--compass_innovation--minzijun--checkpoints--verl_role_sft_dapo_deepmath_train_sample_6144_context_4k_Qwen3-1.7B-base_max_response4096_batch1024_rollout8_vllm_True_bias0.15--global_step_25--actor--huggingface_vllm_temp_1.0.raw.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|