3N3G committed on
Commit
66f5bed
·
verified ·
1 Parent(s): 712e8ef

Training in progress, step 48, checkpoint

Browse files
last-checkpoint/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e7da2c9aef8f35f6786cbf06af70258ed692543ecd8515c205ebddc810fd910
3
  size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c38008e3916fec8e8f62afa5a3bd98fe7c5d26d66abc6e96d2d4af4fff2c184
3
  size 4969539560
last-checkpoint/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16af573dbb77b92352dc7d86e3ffabc1ab8f05af70d970bb7737f8c187b8c429
3
  size 1912795688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9482d36f512837bb053f4653e6c9613c71d822c8b455d0f012e909689a04544a
3
  size 1912795688
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4.0,
6
  "eval_steps": 16,
7
- "global_step": 16,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -128,6 +128,246 @@
128
  "eval_samples_per_second": 18.005,
129
  "eval_steps_per_second": 18.005,
130
  "step": 16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
  ],
133
  "logging_steps": 1,
@@ -147,7 +387,7 @@
147
  "attributes": {}
148
  }
149
  },
150
- "total_flos": 5366131440353280.0,
151
  "train_batch_size": 1,
152
  "trial_name": null,
153
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 12.0,
6
  "eval_steps": 16,
7
+ "global_step": 48,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
128
  "eval_samples_per_second": 18.005,
129
  "eval_steps_per_second": 18.005,
130
  "step": 16
131
+ },
132
+ {
133
+ "epoch": 4.290909090909091,
134
+ "grad_norm": 9.95614242553711,
135
+ "learning_rate": 4e-08,
136
+ "loss": 0.7785,
137
+ "step": 17
138
+ },
139
+ {
140
+ "epoch": 4.581818181818182,
141
+ "grad_norm": 11.356291770935059,
142
+ "learning_rate": 4.25e-08,
143
+ "loss": 0.8645,
144
+ "step": 18
145
+ },
146
+ {
147
+ "epoch": 4.872727272727273,
148
+ "grad_norm": 10.108142852783203,
149
+ "learning_rate": 4.5e-08,
150
+ "loss": 0.7834,
151
+ "step": 19
152
+ },
153
+ {
154
+ "epoch": 5.0,
155
+ "grad_norm": 10.209877014160156,
156
+ "learning_rate": 4.7499999999999995e-08,
157
+ "loss": 0.7744,
158
+ "step": 20
159
+ },
160
+ {
161
+ "epoch": 5.290909090909091,
162
+ "grad_norm": 9.586356163024902,
163
+ "learning_rate": 5e-08,
164
+ "loss": 0.7433,
165
+ "step": 21
166
+ },
167
+ {
168
+ "epoch": 5.581818181818182,
169
+ "grad_norm": 10.589778900146484,
170
+ "learning_rate": 5.25e-08,
171
+ "loss": 0.818,
172
+ "step": 22
173
+ },
174
+ {
175
+ "epoch": 5.872727272727273,
176
+ "grad_norm": 10.28813362121582,
177
+ "learning_rate": 5.5e-08,
178
+ "loss": 0.8245,
179
+ "step": 23
180
+ },
181
+ {
182
+ "epoch": 6.0,
183
+ "grad_norm": 13.027183532714844,
184
+ "learning_rate": 5.749999999999999e-08,
185
+ "loss": 0.8331,
186
+ "step": 24
187
+ },
188
+ {
189
+ "epoch": 6.290909090909091,
190
+ "grad_norm": 10.363873481750488,
191
+ "learning_rate": 6e-08,
192
+ "loss": 0.8331,
193
+ "step": 25
194
+ },
195
+ {
196
+ "epoch": 6.581818181818182,
197
+ "grad_norm": 9.84264850616455,
198
+ "learning_rate": 6.25e-08,
199
+ "loss": 0.755,
200
+ "step": 26
201
+ },
202
+ {
203
+ "epoch": 6.872727272727273,
204
+ "grad_norm": 10.973934173583984,
205
+ "learning_rate": 6.5e-08,
206
+ "loss": 0.8372,
207
+ "step": 27
208
+ },
209
+ {
210
+ "epoch": 7.0,
211
+ "grad_norm": 10.278410911560059,
212
+ "learning_rate": 6.75e-08,
213
+ "loss": 0.7442,
214
+ "step": 28
215
+ },
216
+ {
217
+ "epoch": 7.290909090909091,
218
+ "grad_norm": 10.205405235290527,
219
+ "learning_rate": 6.999999999999999e-08,
220
+ "loss": 0.7851,
221
+ "step": 29
222
+ },
223
+ {
224
+ "epoch": 7.581818181818182,
225
+ "grad_norm": 10.862798690795898,
226
+ "learning_rate": 7.25e-08,
227
+ "loss": 0.7962,
228
+ "step": 30
229
+ },
230
+ {
231
+ "epoch": 7.872727272727273,
232
+ "grad_norm": 9.971634864807129,
233
+ "learning_rate": 7.5e-08,
234
+ "loss": 0.79,
235
+ "step": 31
236
+ },
237
+ {
238
+ "epoch": 8.0,
239
+ "grad_norm": 10.8460111618042,
240
+ "learning_rate": 7.75e-08,
241
+ "loss": 0.9105,
242
+ "step": 32
243
+ },
244
+ {
245
+ "epoch": 8.0,
246
+ "eval_loss": 0.7589532136917114,
247
+ "eval_runtime": 0.7554,
248
+ "eval_samples_per_second": 17.21,
249
+ "eval_steps_per_second": 17.21,
250
+ "step": 32
251
+ },
252
+ {
253
+ "epoch": 8.290909090909091,
254
+ "grad_norm": 10.4276704788208,
255
+ "learning_rate": 8e-08,
256
+ "loss": 0.7914,
257
+ "step": 33
258
+ },
259
+ {
260
+ "epoch": 8.581818181818182,
261
+ "grad_norm": 9.807103157043457,
262
+ "learning_rate": 8.249999999999999e-08,
263
+ "loss": 0.8344,
264
+ "step": 34
265
+ },
266
+ {
267
+ "epoch": 8.872727272727273,
268
+ "grad_norm": 9.850166320800781,
269
+ "learning_rate": 8.5e-08,
270
+ "loss": 0.7419,
271
+ "step": 35
272
+ },
273
+ {
274
+ "epoch": 9.0,
275
+ "grad_norm": 12.547399520874023,
276
+ "learning_rate": 8.75e-08,
277
+ "loss": 0.8597,
278
+ "step": 36
279
+ },
280
+ {
281
+ "epoch": 9.290909090909091,
282
+ "grad_norm": 10.39106559753418,
283
+ "learning_rate": 9e-08,
284
+ "loss": 0.7911,
285
+ "step": 37
286
+ },
287
+ {
288
+ "epoch": 9.581818181818182,
289
+ "grad_norm": 10.728227615356445,
290
+ "learning_rate": 9.25e-08,
291
+ "loss": 0.852,
292
+ "step": 38
293
+ },
294
+ {
295
+ "epoch": 9.872727272727273,
296
+ "grad_norm": 10.104507446289062,
297
+ "learning_rate": 9.499999999999999e-08,
298
+ "loss": 0.7942,
299
+ "step": 39
300
+ },
301
+ {
302
+ "epoch": 10.0,
303
+ "grad_norm": 9.163139343261719,
304
+ "learning_rate": 9.749999999999999e-08,
305
+ "loss": 0.7006,
306
+ "step": 40
307
+ },
308
+ {
309
+ "epoch": 10.290909090909091,
310
+ "grad_norm": 9.795455932617188,
311
+ "learning_rate": 1e-07,
312
+ "loss": 0.7496,
313
+ "step": 41
314
+ },
315
+ {
316
+ "epoch": 10.581818181818182,
317
+ "grad_norm": 9.88698959350586,
318
+ "learning_rate": 9.99982865378877e-08,
319
+ "loss": 0.7978,
320
+ "step": 42
321
+ },
322
+ {
323
+ "epoch": 10.872727272727273,
324
+ "grad_norm": 10.60831069946289,
325
+ "learning_rate": 9.99931462820376e-08,
326
+ "loss": 0.8437,
327
+ "step": 43
328
+ },
329
+ {
330
+ "epoch": 11.0,
331
+ "grad_norm": 10.177803039550781,
332
+ "learning_rate": 9.998457962390006e-08,
333
+ "loss": 0.7926,
334
+ "step": 44
335
+ },
336
+ {
337
+ "epoch": 11.290909090909091,
338
+ "grad_norm": 9.594599723815918,
339
+ "learning_rate": 9.997258721585931e-08,
340
+ "loss": 0.7521,
341
+ "step": 45
342
+ },
343
+ {
344
+ "epoch": 11.581818181818182,
345
+ "grad_norm": 9.713711738586426,
346
+ "learning_rate": 9.99571699711836e-08,
347
+ "loss": 0.7497,
348
+ "step": 46
349
+ },
350
+ {
351
+ "epoch": 11.872727272727273,
352
+ "grad_norm": 10.672869682312012,
353
+ "learning_rate": 9.993832906395581e-08,
354
+ "loss": 0.8709,
355
+ "step": 47
356
+ },
357
+ {
358
+ "epoch": 12.0,
359
+ "grad_norm": 10.758075714111328,
360
+ "learning_rate": 9.991606592898401e-08,
361
+ "loss": 0.8193,
362
+ "step": 48
363
+ },
364
+ {
365
+ "epoch": 12.0,
366
+ "eval_loss": 0.7549822926521301,
367
+ "eval_runtime": 0.7468,
368
+ "eval_samples_per_second": 17.407,
369
+ "eval_steps_per_second": 17.407,
370
+ "step": 48
371
  }
372
  ],
373
  "logging_steps": 1,
 
387
  "attributes": {}
388
  }
389
  },
390
+ "total_flos": 1.609839432105984e+16,
391
  "train_batch_size": 1,
392
  "trial_name": null,
393
  "trial_params": null