radna commited on
Commit
c75c2b6
·
verified ·
1 Parent(s): 3b99bba

Upload folder using huggingface_hub

Browse files
Files changed (42) hide show
  1. checkpoint-10/adapter_config.json +2 -2
  2. checkpoint-10/trainer_state.json +16 -15
  3. checkpoint-10/training_args.bin +1 -1
  4. checkpoint-12/adapter_config.json +2 -2
  5. checkpoint-12/trainer_state.json +19 -18
  6. checkpoint-12/training_args.bin +1 -1
  7. checkpoint-14/adapter_config.json +2 -2
  8. checkpoint-14/trainer_state.json +21 -20
  9. checkpoint-14/training_args.bin +1 -1
  10. checkpoint-16/adapter_config.json +2 -2
  11. checkpoint-16/trainer_state.json +23 -22
  12. checkpoint-16/training_args.bin +1 -1
  13. checkpoint-18/adapter_config.json +2 -2
  14. checkpoint-18/trainer_state.json +26 -25
  15. checkpoint-18/training_args.bin +1 -1
  16. checkpoint-2/adapter_config.json +2 -2
  17. checkpoint-2/trainer_state.json +6 -5
  18. checkpoint-2/training_args.bin +1 -1
  19. checkpoint-20/adapter_config.json +2 -2
  20. checkpoint-20/trainer_state.json +28 -27
  21. checkpoint-20/training_args.bin +1 -1
  22. checkpoint-22/adapter_config.json +2 -2
  23. checkpoint-22/trainer_state.json +31 -30
  24. checkpoint-22/training_args.bin +1 -1
  25. checkpoint-24/adapter_config.json +2 -2
  26. checkpoint-24/trainer_state.json +34 -33
  27. checkpoint-24/training_args.bin +1 -1
  28. checkpoint-26/adapter_config.json +2 -2
  29. checkpoint-26/trainer_state.json +36 -35
  30. checkpoint-26/training_args.bin +1 -1
  31. checkpoint-28/adapter_config.json +2 -2
  32. checkpoint-28/trainer_state.json +38 -37
  33. checkpoint-28/training_args.bin +1 -1
  34. checkpoint-4/adapter_config.json +2 -2
  35. checkpoint-4/trainer_state.json +8 -7
  36. checkpoint-4/training_args.bin +1 -1
  37. checkpoint-6/adapter_config.json +2 -2
  38. checkpoint-6/trainer_state.json +12 -11
  39. checkpoint-6/training_args.bin +1 -1
  40. checkpoint-8/adapter_config.json +2 -2
  41. checkpoint-8/trainer_state.json +14 -13
  42. checkpoint-8/training_args.bin +1 -1
checkpoint-10/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-10/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.012996690347790718,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6",
4
  "epoch": 2.4210526315789473,
@@ -6,7 +7,7 @@
6
  "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  }
 
1
  {
2
+ "best_global_step": 6,
3
  "best_metric": 0.012996690347790718,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6",
5
  "epoch": 2.4210526315789473,
 
7
  "global_step": 10,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  }
checkpoint-10/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-12/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-12/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03234308212995529,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
4
  "epoch": 2.8421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 12,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
 
1
  {
2
+ "best_global_step": 12,
3
  "best_metric": 0.03234308212995529,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
5
  "epoch": 2.8421052631578947,
 
7
  "global_step": 12,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
checkpoint-12/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-14/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-14/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03234308212995529,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
4
  "epoch": 3.4210526315789473,
@@ -6,7 +7,7 @@
6
  "global_step": 14,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  }
 
1
  {
2
+ "best_global_step": 12,
3
  "best_metric": 0.03234308212995529,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
5
  "epoch": 3.4210526315789473,
 
7
  "global_step": 14,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 187.02,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 187.02,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  }
checkpoint-14/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-16/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-16/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03234308212995529,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
4
  "epoch": 3.8421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 16,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  }
 
1
  {
2
+ "best_global_step": 12,
3
  "best_metric": 0.03234308212995529,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
5
  "epoch": 3.8421052631578947,
 
7
  "global_step": 16,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 187.02,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 187.02,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 187.02,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 187.02,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  }
checkpoint-16/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-18/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-18/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03729328140616417,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
4
  "epoch": 4.421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 18,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
 
1
  {
2
+ "best_global_step": 18,
3
  "best_metric": 0.03729328140616417,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
5
  "epoch": 4.421052631578947,
 
7
  "global_step": 18,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 187.02,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 187.02,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 187.02,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 187.02,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 187.02,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 187.02,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
checkpoint-18/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-2/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-2/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.42105263157894735,
@@ -6,7 +7,7 @@
6
  "global_step": 2,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  }
39
  ],
40
  "logging_steps": 1,
 
1
  {
2
+ "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
  "epoch": 0.42105263157894735,
 
7
  "global_step": 2,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  }
40
  ],
41
  "logging_steps": 1,
checkpoint-2/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-20/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-20/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03729328140616417,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
4
  "epoch": 4.842105263157895,
@@ -6,7 +7,7 @@
6
  "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  }
 
1
  {
2
+ "best_global_step": 18,
3
  "best_metric": 0.03729328140616417,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
5
  "epoch": 4.842105263157895,
 
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 187.02,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 187.02,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 187.02,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 187.02,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 187.02,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 187.02,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 187.02,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 187.02,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  }
checkpoint-20/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-22/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-22/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03729328140616417,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
4
  "epoch": 5.421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 22,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  },
@@ -338,14 +339,14 @@
338
  "kl": 0.17626953125,
339
  "learning_rate": 8.213938048432697e-05,
340
  "loss": -0.008662773296236992,
341
- "memory(GiB)": 182.91,
342
  "response_clip_ratio": 0.5625,
343
  "reward": 0.04996980866417289,
344
  "reward_std": 0.13849420100450516,
345
  "rewards/CosineReward": 0.049969930201768875,
346
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
347
  "step": 21,
348
- "train_speed(iter/s)": 0.000408
349
  },
350
  {
351
  "clip_ratio": 5.869188044016482e-05,
@@ -354,7 +355,7 @@
354
  "kl": 0.178955078125,
355
  "learning_rate": 7.985792958513931e-05,
356
  "loss": -0.008743642829358578,
357
- "memory(GiB)": 182.91,
358
  "step": 22,
359
  "train_speed(iter/s)": 0.000426
360
  }
 
1
  {
2
+ "best_global_step": 18,
3
  "best_metric": 0.03729328140616417,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
5
  "epoch": 5.421052631578947,
 
7
  "global_step": 22,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 187.02,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 187.02,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 187.02,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 187.02,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 187.02,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 187.02,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 187.02,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 187.02,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  },
 
339
  "kl": 0.17626953125,
340
  "learning_rate": 8.213938048432697e-05,
341
  "loss": -0.008662773296236992,
342
+ "memory(GiB)": 187.02,
343
  "response_clip_ratio": 0.5625,
344
  "reward": 0.04996980866417289,
345
  "reward_std": 0.13849420100450516,
346
  "rewards/CosineReward": 0.049969930201768875,
347
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
348
  "step": 21,
349
+ "train_speed(iter/s)": 0.000407
350
  },
351
  {
352
  "clip_ratio": 5.869188044016482e-05,
 
355
  "kl": 0.178955078125,
356
  "learning_rate": 7.985792958513931e-05,
357
  "loss": -0.008743642829358578,
358
+ "memory(GiB)": 187.02,
359
  "step": 22,
360
  "train_speed(iter/s)": 0.000426
361
  }
checkpoint-22/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-24/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-24/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.04339282959699631,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
4
  "epoch": 5.842105263157895,
@@ -6,7 +7,7 @@
6
  "global_step": 24,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  },
@@ -338,14 +339,14 @@
338
  "kl": 0.17626953125,
339
  "learning_rate": 8.213938048432697e-05,
340
  "loss": -0.008662773296236992,
341
- "memory(GiB)": 182.91,
342
  "response_clip_ratio": 0.5625,
343
  "reward": 0.04996980866417289,
344
  "reward_std": 0.13849420100450516,
345
  "rewards/CosineReward": 0.049969930201768875,
346
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
347
  "step": 21,
348
- "train_speed(iter/s)": 0.000408
349
  },
350
  {
351
  "clip_ratio": 5.869188044016482e-05,
@@ -354,7 +355,7 @@
354
  "kl": 0.178955078125,
355
  "learning_rate": 7.985792958513931e-05,
356
  "loss": -0.008743642829358578,
357
- "memory(GiB)": 182.91,
358
  "step": 22,
359
  "train_speed(iter/s)": 0.000426
360
  },
@@ -366,7 +367,7 @@
366
  "kl": 0.1796875,
367
  "learning_rate": 7.74754489035403e-05,
368
  "loss": -0.03423420712351799,
369
- "memory(GiB)": 182.91,
370
  "response_clip_ratio": 0.583984375,
371
  "reward": 0.034468831261619925,
372
  "reward_std": 0.11841745302081108,
@@ -380,7 +381,7 @@
380
  "grad_norm": 0.014131724834442139,
381
  "learning_rate": 7.500000000000001e-05,
382
  "loss": -0.03426633030176163,
383
- "memory(GiB)": 182.91,
384
  "step": 24,
385
  "train_speed(iter/s)": 0.000427
386
  },
@@ -395,7 +396,7 @@
395
  "eval_reward_std": 0.10456253588199615,
396
  "eval_rewards/CosineReward": 0.04339282959699631,
397
  "eval_rewards/RepetitionPenalty": 0.0,
398
- "eval_runtime": 1045.0632,
399
  "eval_samples_per_second": 0.001,
400
  "eval_steps_per_second": 0.001,
401
  "step": 24
 
1
  {
2
+ "best_global_step": 24,
3
  "best_metric": 0.04339282959699631,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
5
  "epoch": 5.842105263157895,
 
7
  "global_step": 24,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 187.02,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 187.02,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 187.02,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 187.02,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 187.02,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 187.02,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 187.02,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 187.02,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  },
 
339
  "kl": 0.17626953125,
340
  "learning_rate": 8.213938048432697e-05,
341
  "loss": -0.008662773296236992,
342
+ "memory(GiB)": 187.02,
343
  "response_clip_ratio": 0.5625,
344
  "reward": 0.04996980866417289,
345
  "reward_std": 0.13849420100450516,
346
  "rewards/CosineReward": 0.049969930201768875,
347
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
348
  "step": 21,
349
+ "train_speed(iter/s)": 0.000407
350
  },
351
  {
352
  "clip_ratio": 5.869188044016482e-05,
 
355
  "kl": 0.178955078125,
356
  "learning_rate": 7.985792958513931e-05,
357
  "loss": -0.008743642829358578,
358
+ "memory(GiB)": 187.02,
359
  "step": 22,
360
  "train_speed(iter/s)": 0.000426
361
  },
 
367
  "kl": 0.1796875,
368
  "learning_rate": 7.74754489035403e-05,
369
  "loss": -0.03423420712351799,
370
+ "memory(GiB)": 187.02,
371
  "response_clip_ratio": 0.583984375,
372
  "reward": 0.034468831261619925,
373
  "reward_std": 0.11841745302081108,
 
381
  "grad_norm": 0.014131724834442139,
382
  "learning_rate": 7.500000000000001e-05,
383
  "loss": -0.03426633030176163,
384
+ "memory(GiB)": 187.02,
385
  "step": 24,
386
  "train_speed(iter/s)": 0.000427
387
  },
 
396
  "eval_reward_std": 0.10456253588199615,
397
  "eval_rewards/CosineReward": 0.04339282959699631,
398
  "eval_rewards/RepetitionPenalty": 0.0,
399
+ "eval_runtime": 1045.0642,
400
  "eval_samples_per_second": 0.001,
401
  "eval_steps_per_second": 0.001,
402
  "step": 24
checkpoint-24/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-26/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-26/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.04339282959699631,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
4
  "epoch": 6.421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 26,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  },
@@ -338,14 +339,14 @@
338
  "kl": 0.17626953125,
339
  "learning_rate": 8.213938048432697e-05,
340
  "loss": -0.008662773296236992,
341
- "memory(GiB)": 182.91,
342
  "response_clip_ratio": 0.5625,
343
  "reward": 0.04996980866417289,
344
  "reward_std": 0.13849420100450516,
345
  "rewards/CosineReward": 0.049969930201768875,
346
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
347
  "step": 21,
348
- "train_speed(iter/s)": 0.000408
349
  },
350
  {
351
  "clip_ratio": 5.869188044016482e-05,
@@ -354,7 +355,7 @@
354
  "kl": 0.178955078125,
355
  "learning_rate": 7.985792958513931e-05,
356
  "loss": -0.008743642829358578,
357
- "memory(GiB)": 182.91,
358
  "step": 22,
359
  "train_speed(iter/s)": 0.000426
360
  },
@@ -366,7 +367,7 @@
366
  "kl": 0.1796875,
367
  "learning_rate": 7.74754489035403e-05,
368
  "loss": -0.03423420712351799,
369
- "memory(GiB)": 182.91,
370
  "response_clip_ratio": 0.583984375,
371
  "reward": 0.034468831261619925,
372
  "reward_std": 0.11841745302081108,
@@ -380,7 +381,7 @@
380
  "grad_norm": 0.014131724834442139,
381
  "learning_rate": 7.500000000000001e-05,
382
  "loss": -0.03426633030176163,
383
- "memory(GiB)": 182.91,
384
  "step": 24,
385
  "train_speed(iter/s)": 0.000427
386
  },
@@ -395,7 +396,7 @@
395
  "eval_reward_std": 0.10456253588199615,
396
  "eval_rewards/CosineReward": 0.04339282959699631,
397
  "eval_rewards/RepetitionPenalty": 0.0,
398
- "eval_runtime": 1045.0632,
399
  "eval_samples_per_second": 0.001,
400
  "eval_steps_per_second": 0.001,
401
  "step": 24
@@ -408,7 +409,7 @@
408
  "kl": 0.1800537109375,
409
  "learning_rate": 7.243995901002312e-05,
410
  "loss": -0.02097315341234207,
411
- "memory(GiB)": 182.91,
412
  "response_clip_ratio": 0.6171875,
413
  "reward": 0.03010205877944827,
414
  "reward_std": 0.10742511600255966,
@@ -424,7 +425,7 @@
424
  "kl": 0.18408203125,
425
  "learning_rate": 6.980398830195785e-05,
426
  "loss": -0.02103913575410843,
427
- "memory(GiB)": 182.91,
428
  "step": 26,
429
  "train_speed(iter/s)": 0.000421
430
  }
 
1
  {
2
+ "best_global_step": 24,
3
  "best_metric": 0.04339282959699631,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
5
  "epoch": 6.421052631578947,
 
7
  "global_step": 26,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 187.02,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 187.02,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 187.02,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 187.02,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 187.02,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 187.02,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 187.02,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 187.02,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  },
 
339
  "kl": 0.17626953125,
340
  "learning_rate": 8.213938048432697e-05,
341
  "loss": -0.008662773296236992,
342
+ "memory(GiB)": 187.02,
343
  "response_clip_ratio": 0.5625,
344
  "reward": 0.04996980866417289,
345
  "reward_std": 0.13849420100450516,
346
  "rewards/CosineReward": 0.049969930201768875,
347
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
348
  "step": 21,
349
+ "train_speed(iter/s)": 0.000407
350
  },
351
  {
352
  "clip_ratio": 5.869188044016482e-05,
 
355
  "kl": 0.178955078125,
356
  "learning_rate": 7.985792958513931e-05,
357
  "loss": -0.008743642829358578,
358
+ "memory(GiB)": 187.02,
359
  "step": 22,
360
  "train_speed(iter/s)": 0.000426
361
  },
 
367
  "kl": 0.1796875,
368
  "learning_rate": 7.74754489035403e-05,
369
  "loss": -0.03423420712351799,
370
+ "memory(GiB)": 187.02,
371
  "response_clip_ratio": 0.583984375,
372
  "reward": 0.034468831261619925,
373
  "reward_std": 0.11841745302081108,
 
381
  "grad_norm": 0.014131724834442139,
382
  "learning_rate": 7.500000000000001e-05,
383
  "loss": -0.03426633030176163,
384
+ "memory(GiB)": 187.02,
385
  "step": 24,
386
  "train_speed(iter/s)": 0.000427
387
  },
 
396
  "eval_reward_std": 0.10456253588199615,
397
  "eval_rewards/CosineReward": 0.04339282959699631,
398
  "eval_rewards/RepetitionPenalty": 0.0,
399
+ "eval_runtime": 1045.0642,
400
  "eval_samples_per_second": 0.001,
401
  "eval_steps_per_second": 0.001,
402
  "step": 24
 
409
  "kl": 0.1800537109375,
410
  "learning_rate": 7.243995901002312e-05,
411
  "loss": -0.02097315341234207,
412
+ "memory(GiB)": 187.02,
413
  "response_clip_ratio": 0.6171875,
414
  "reward": 0.03010205877944827,
415
  "reward_std": 0.10742511600255966,
 
425
  "kl": 0.18408203125,
426
  "learning_rate": 6.980398830195785e-05,
427
  "loss": -0.02103913575410843,
428
+ "memory(GiB)": 187.02,
429
  "step": 26,
430
  "train_speed(iter/s)": 0.000421
431
  }
checkpoint-26/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-28/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-28/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.04339282959699631,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
4
  "epoch": 6.842105263157895,
@@ -6,7 +7,7 @@
6
  "global_step": 28,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  },
@@ -338,14 +339,14 @@
338
  "kl": 0.17626953125,
339
  "learning_rate": 8.213938048432697e-05,
340
  "loss": -0.008662773296236992,
341
- "memory(GiB)": 182.91,
342
  "response_clip_ratio": 0.5625,
343
  "reward": 0.04996980866417289,
344
  "reward_std": 0.13849420100450516,
345
  "rewards/CosineReward": 0.049969930201768875,
346
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
347
  "step": 21,
348
- "train_speed(iter/s)": 0.000408
349
  },
350
  {
351
  "clip_ratio": 5.869188044016482e-05,
@@ -354,7 +355,7 @@
354
  "kl": 0.178955078125,
355
  "learning_rate": 7.985792958513931e-05,
356
  "loss": -0.008743642829358578,
357
- "memory(GiB)": 182.91,
358
  "step": 22,
359
  "train_speed(iter/s)": 0.000426
360
  },
@@ -366,7 +367,7 @@
366
  "kl": 0.1796875,
367
  "learning_rate": 7.74754489035403e-05,
368
  "loss": -0.03423420712351799,
369
- "memory(GiB)": 182.91,
370
  "response_clip_ratio": 0.583984375,
371
  "reward": 0.034468831261619925,
372
  "reward_std": 0.11841745302081108,
@@ -380,7 +381,7 @@
380
  "grad_norm": 0.014131724834442139,
381
  "learning_rate": 7.500000000000001e-05,
382
  "loss": -0.03426633030176163,
383
- "memory(GiB)": 182.91,
384
  "step": 24,
385
  "train_speed(iter/s)": 0.000427
386
  },
@@ -395,7 +396,7 @@
395
  "eval_reward_std": 0.10456253588199615,
396
  "eval_rewards/CosineReward": 0.04339282959699631,
397
  "eval_rewards/RepetitionPenalty": 0.0,
398
- "eval_runtime": 1045.0632,
399
  "eval_samples_per_second": 0.001,
400
  "eval_steps_per_second": 0.001,
401
  "step": 24
@@ -408,7 +409,7 @@
408
  "kl": 0.1800537109375,
409
  "learning_rate": 7.243995901002312e-05,
410
  "loss": -0.02097315341234207,
411
- "memory(GiB)": 182.91,
412
  "response_clip_ratio": 0.6171875,
413
  "reward": 0.03010205877944827,
414
  "reward_std": 0.10742511600255966,
@@ -424,7 +425,7 @@
424
  "kl": 0.18408203125,
425
  "learning_rate": 6.980398830195785e-05,
426
  "loss": -0.02103913575410843,
427
- "memory(GiB)": 182.91,
428
  "step": 26,
429
  "train_speed(iter/s)": 0.000421
430
  },
@@ -436,7 +437,7 @@
436
  "kl": 0.174560546875,
437
  "learning_rate": 6.710100716628344e-05,
438
  "loss": -0.03593946248292923,
439
- "memory(GiB)": 182.91,
440
  "response_clip_ratio": 0.513671875,
441
  "reward": 0.04752760287374258,
442
  "reward_std": 0.14935147762298584,
@@ -452,7 +453,7 @@
452
  "kl": 0.182373046875,
453
  "learning_rate": 6.434016163555452e-05,
454
  "loss": -0.03595500811934471,
455
- "memory(GiB)": 182.91,
456
  "step": 28,
457
  "train_speed(iter/s)": 0.000422
458
  }
 
1
  {
2
+ "best_global_step": 24,
3
  "best_metric": 0.04339282959699631,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
5
  "epoch": 6.842105263157895,
 
7
  "global_step": 28,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 176.98,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 176.98,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 187.02,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 187.02,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9048,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 187.02,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 187.02,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 187.02,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 187.02,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 187.02,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 187.02,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 187.02,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 187.02,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  },
 
339
  "kl": 0.17626953125,
340
  "learning_rate": 8.213938048432697e-05,
341
  "loss": -0.008662773296236992,
342
+ "memory(GiB)": 187.02,
343
  "response_clip_ratio": 0.5625,
344
  "reward": 0.04996980866417289,
345
  "reward_std": 0.13849420100450516,
346
  "rewards/CosineReward": 0.049969930201768875,
347
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
348
  "step": 21,
349
+ "train_speed(iter/s)": 0.000407
350
  },
351
  {
352
  "clip_ratio": 5.869188044016482e-05,
 
355
  "kl": 0.178955078125,
356
  "learning_rate": 7.985792958513931e-05,
357
  "loss": -0.008743642829358578,
358
+ "memory(GiB)": 187.02,
359
  "step": 22,
360
  "train_speed(iter/s)": 0.000426
361
  },
 
367
  "kl": 0.1796875,
368
  "learning_rate": 7.74754489035403e-05,
369
  "loss": -0.03423420712351799,
370
+ "memory(GiB)": 187.02,
371
  "response_clip_ratio": 0.583984375,
372
  "reward": 0.034468831261619925,
373
  "reward_std": 0.11841745302081108,
 
381
  "grad_norm": 0.014131724834442139,
382
  "learning_rate": 7.500000000000001e-05,
383
  "loss": -0.03426633030176163,
384
+ "memory(GiB)": 187.02,
385
  "step": 24,
386
  "train_speed(iter/s)": 0.000427
387
  },
 
396
  "eval_reward_std": 0.10456253588199615,
397
  "eval_rewards/CosineReward": 0.04339282959699631,
398
  "eval_rewards/RepetitionPenalty": 0.0,
399
+ "eval_runtime": 1045.0642,
400
  "eval_samples_per_second": 0.001,
401
  "eval_steps_per_second": 0.001,
402
  "step": 24
 
409
  "kl": 0.1800537109375,
410
  "learning_rate": 7.243995901002312e-05,
411
  "loss": -0.02097315341234207,
412
+ "memory(GiB)": 187.02,
413
  "response_clip_ratio": 0.6171875,
414
  "reward": 0.03010205877944827,
415
  "reward_std": 0.10742511600255966,
 
425
  "kl": 0.18408203125,
426
  "learning_rate": 6.980398830195785e-05,
427
  "loss": -0.02103913575410843,
428
+ "memory(GiB)": 187.02,
429
  "step": 26,
430
  "train_speed(iter/s)": 0.000421
431
  },
 
437
  "kl": 0.174560546875,
438
  "learning_rate": 6.710100716628344e-05,
439
  "loss": -0.03593946248292923,
440
+ "memory(GiB)": 187.02,
441
  "response_clip_ratio": 0.513671875,
442
  "reward": 0.04752760287374258,
443
  "reward_std": 0.14935147762298584,
 
453
  "kl": 0.182373046875,
454
  "learning_rate": 6.434016163555452e-05,
455
  "loss": -0.03595500811934471,
456
+ "memory(GiB)": 187.02,
457
  "step": 28,
458
  "train_speed(iter/s)": 0.000422
459
  }
checkpoint-28/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-4/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-4/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.8421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 4,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  }
 
1
  {
2
+ "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
  "epoch": 0.8421052631578947,
 
7
  "global_step": 4,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  }
checkpoint-4/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-6/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-6/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.012996690347790718,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6",
4
  "epoch": 1.4210526315789473,
@@ -6,7 +7,7 @@
6
  "global_step": 6,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
 
1
  {
2
+ "best_global_step": 6,
3
  "best_metric": 0.012996690347790718,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6",
5
  "epoch": 1.4210526315789473,
 
7
  "global_step": 6,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
checkpoint-6/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809
checkpoint-8/adapter_config.json CHANGED
@@ -24,10 +24,10 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
- "gate_proj",
28
  "k_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
  "down_proj"
33
  ],
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "up_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "q_proj",
32
  "down_proj"
33
  ],
checkpoint-8/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.012996690347790718,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6",
4
  "epoch": 1.8421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 8,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  }
 
1
  {
2
+ "best_global_step": 6,
3
  "best_metric": 0.012996690347790718,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6",
5
  "epoch": 1.8421052631578947,
 
7
  "global_step": 8,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 176.98,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 176.98,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 176.98,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 176.98,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 176.98,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 176.98,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1126,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 176.98,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 176.98,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  }
checkpoint-8/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cdf21dfd9faa218b7fd99e3f3dc0ef681c4e3fd3b905e7348f5467b0198044
3
  size 9809