alenphilip commited on
Commit
5d6c820
·
verified ·
1 Parent(s): e6f063b

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dab8a7a90be4132c7e6c7e6c6466a52e13d845dace74f4585f13ce0d4447aa53
3
  size 201880976
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeda7b371ff2e1752bf1aff362fa660259c343ff41adf1ebf4a35769f07ce5e5
3
  size 201880976
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b0f884c0fdc51885e527451a10871636e5f43cc1aade97f045d5afd3a0709d8
3
  size 102771467
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21b4cbbc7c7e47ac572d0611695777730d7795b30ce4422d923e37f4c43b2d15
3
  size 102771467
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:421676f97cd124780e65268d7dc0a07293d3d73d0daa32a18560e251ed29e808
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de2f6fd8a366989100bcb570e1fd69da9deb6a29ce5bba1d2c8889118062705c
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f71739903ba898d44abdc409bd0e9f2dcc946caab0fe7ef602f12d023f02c330
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63ca6d6866d748b90a4b2173e0ca24db709af27b45b8531207b094cb85539103
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 50,
3
- "best_metric": 0.74902075023143,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.4678362573099415,
6
  "eval_steps": 50,
7
- "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -134,6 +134,132 @@
134
  "eval_samples_per_second": 27.224,
135
  "eval_steps_per_second": 1.703,
136
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  }
138
  ],
139
  "logging_steps": 10,
@@ -153,7 +279,7 @@
153
  "attributes": {}
154
  }
155
  },
156
- "total_flos": 2.130999126944809e+17,
157
  "train_batch_size": 16,
158
  "trial_name": null,
159
  "trial_params": null
 
1
  {
2
+ "best_global_step": 200,
3
+ "best_metric": 0.7492690359164101,
4
+ "best_model_checkpoint": "./qwen2.5-7b-sft-qlora/checkpoint-200",
5
+ "epoch": 0.935672514619883,
6
  "eval_steps": 50,
7
+ "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
134
  "eval_samples_per_second": 27.224,
135
  "eval_steps_per_second": 1.703,
136
  "step": 100
137
+ },
138
+ {
139
+ "entropy": 0.6501711800694465,
140
+ "epoch": 0.5146198830409356,
141
+ "grad_norm": 0.1625615507364273,
142
+ "learning_rate": 0.00018584487936018661,
143
+ "loss": 0.6484,
144
+ "mean_token_accuracy": 0.8180312633514404,
145
+ "num_tokens": 2659238.0,
146
+ "step": 110
147
+ },
148
+ {
149
+ "entropy": 0.6405581876635551,
150
+ "epoch": 0.5614035087719298,
151
+ "grad_norm": 0.17417997121810913,
152
+ "learning_rate": 0.00018137863234250347,
153
+ "loss": 0.6404,
154
+ "mean_token_accuracy": 0.819054339826107,
155
+ "num_tokens": 2897816.0,
156
+ "step": 120
157
+ },
158
+ {
159
+ "entropy": 0.6380819544196129,
160
+ "epoch": 0.6081871345029239,
161
+ "grad_norm": 0.17349691689014435,
162
+ "learning_rate": 0.00017637082395311024,
163
+ "loss": 0.6366,
164
+ "mean_token_accuracy": 0.820624266564846,
165
+ "num_tokens": 3136294.0,
166
+ "step": 130
167
+ },
168
+ {
169
+ "entropy": 0.6500405013561249,
170
+ "epoch": 0.6549707602339181,
171
+ "grad_norm": 0.18412715196609497,
172
+ "learning_rate": 0.00017085478033060806,
173
+ "loss": 0.6426,
174
+ "mean_token_accuracy": 0.8185427248477936,
175
+ "num_tokens": 3375202.0,
176
+ "step": 140
177
+ },
178
+ {
179
+ "entropy": 0.6269903033971786,
180
+ "epoch": 0.7017543859649122,
181
+ "grad_norm": 0.1778886765241623,
182
+ "learning_rate": 0.00016486720983522156,
183
+ "loss": 0.6279,
184
+ "mean_token_accuracy": 0.8219256103038788,
185
+ "num_tokens": 3614721.0,
186
+ "step": 150
187
+ },
188
+ {
189
+ "epoch": 0.7017543859649122,
190
+ "eval_bleu": 61.15829556167586,
191
+ "eval_entropy": 0.5959388177703928,
192
+ "eval_loss": 0.6073054671287537,
193
+ "eval_mean_token_accuracy": 0.8267559442255232,
194
+ "eval_num_tokens": 3614721.0,
195
+ "eval_rougeL": 0.7485533859740823,
196
+ "eval_runtime": 63.4672,
197
+ "eval_samples_per_second": 27.195,
198
+ "eval_steps_per_second": 1.702,
199
+ "step": 150
200
+ },
201
+ {
202
+ "entropy": 0.6273025006055832,
203
+ "epoch": 0.7485380116959064,
204
+ "grad_norm": 0.17554914951324463,
205
+ "learning_rate": 0.000158447958760718,
206
+ "loss": 0.6235,
207
+ "mean_token_accuracy": 0.8232012897729873,
208
+ "num_tokens": 3852615.0,
209
+ "step": 160
210
+ },
211
+ {
212
+ "entropy": 0.6264464437961579,
213
+ "epoch": 0.7953216374269005,
214
+ "grad_norm": 0.17685498297214508,
215
+ "learning_rate": 0.0001516397461638962,
216
+ "loss": 0.6223,
217
+ "mean_token_accuracy": 0.8228656515479088,
218
+ "num_tokens": 4085589.0,
219
+ "step": 170
220
+ },
221
+ {
222
+ "entropy": 0.623998960852623,
223
+ "epoch": 0.8421052631578947,
224
+ "grad_norm": 0.1789834052324295,
225
+ "learning_rate": 0.0001444878795763121,
226
+ "loss": 0.6191,
227
+ "mean_token_accuracy": 0.8224357396364212,
228
+ "num_tokens": 4327626.0,
229
+ "step": 180
230
+ },
231
+ {
232
+ "entropy": 0.6093558698892594,
233
+ "epoch": 0.8888888888888888,
234
+ "grad_norm": 0.17523610591888428,
235
+ "learning_rate": 0.00013703995349013113,
236
+ "loss": 0.61,
237
+ "mean_token_accuracy": 0.8264237254858017,
238
+ "num_tokens": 4570278.0,
239
+ "step": 190
240
+ },
241
+ {
242
+ "entropy": 0.6039168611168861,
243
+ "epoch": 0.935672514619883,
244
+ "grad_norm": 0.18692275881767273,
245
+ "learning_rate": 0.00012934553262463548,
246
+ "loss": 0.6032,
247
+ "mean_token_accuracy": 0.828160648047924,
248
+ "num_tokens": 4806172.0,
249
+ "step": 200
250
+ },
251
+ {
252
+ "epoch": 0.935672514619883,
253
+ "eval_bleu": 60.260312927941236,
254
+ "eval_entropy": 0.5826076859677279,
255
+ "eval_loss": 0.6021928787231445,
256
+ "eval_mean_token_accuracy": 0.8273030961001361,
257
+ "eval_num_tokens": 4806172.0,
258
+ "eval_rougeL": 0.7492690359164101,
259
+ "eval_runtime": 63.3853,
260
+ "eval_samples_per_second": 27.23,
261
+ "eval_steps_per_second": 1.704,
262
+ "step": 200
263
  }
264
  ],
265
  "logging_steps": 10,
 
279
  "attributes": {}
280
  }
281
  },
282
+ "total_flos": 4.157882340289413e+17,
283
  "train_batch_size": 16,
284
  "trial_name": null,
285
  "trial_params": null