ljcamargo commited on
Commit
2f614ab
·
verified ·
1 Parent(s): 79d3c11

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b9d88423883eaf1c04e57a3842d0888c8c7e8d00f10a953643dbcd1d1ac64c4
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70bb5bb361cb2c44a3c95065d77a422ed9649be5bd191a41e78d34e35834847b
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34d645606d6eb146eb99e0cc0adef1d87d290800358b32916d2b761d315b2c6b
3
- size 2061570519
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098af36ab395a778e4ad67ddfa0191cced4901cc9518f1a83364c1e0ed5e1dbe
3
+ size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d383a88beb779fca7791b337addbc04045039df50bd4a6ea6c5557699bd1e48
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e4743b658fa9de4e54f29c9ad2e962e40b252677e30a13b5a792846f596864f
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc1a0da602f8abf4bf342932694d528cc1f0baa4d5027de58ad34f4d9855d085
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e35963fbe17703d43e57c264c8bf401c049828d6ea5abe6c269f936eebec007
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2aecfedc7f5bc992e41736b72929d50222b1c844811a96ac093ccc646bdc5dd7
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bb282d448dcf74bd6fbccf99dc933faaae6d52cfc91f1d2df7df3c6a133ab1a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.08609556607834697,
6
  "eval_steps": 300,
7
- "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -218,6 +218,216 @@
218
  "learning_rate": 0.00019782875990384568,
219
  "loss": 1.0476,
220
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  }
222
  ],
223
  "logging_steps": 10,
@@ -237,7 +447,7 @@
237
  "attributes": {}
238
  }
239
  },
240
- "total_flos": 1.2275648299008e+19,
241
  "train_batch_size": 6,
242
  "trial_name": null,
243
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.17219113215669393,
6
  "eval_steps": 300,
7
+ "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
218
  "learning_rate": 0.00019782875990384568,
219
  "loss": 1.0476,
220
  "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.08896541828095852,
224
+ "grad_norm": 8.840872764587402,
225
+ "learning_rate": 0.00019763402207343338,
226
+ "loss": 1.0478,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.0918352704835701,
231
+ "grad_norm": 11.326393127441406,
232
+ "learning_rate": 0.00019743102648521967,
233
+ "loss": 1.0235,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.09470512268618166,
238
+ "grad_norm": 15.35113525390625,
239
+ "learning_rate": 0.00019721979030830572,
240
+ "loss": 0.9794,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.09757497488879323,
245
+ "grad_norm": 11.8535795211792,
246
+ "learning_rate": 0.0001970003314087709,
247
+ "loss": 1.0072,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.10044482709140479,
252
+ "grad_norm": 24.779190063476562,
253
+ "learning_rate": 0.0001967726683481617,
254
+ "loss": 1.0056,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.10331467929401636,
259
+ "grad_norm": 20.744426727294922,
260
+ "learning_rate": 0.00019653682038192188,
261
+ "loss": 1.0066,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.10618453149662792,
266
+ "grad_norm": 21.19144630432129,
267
+ "learning_rate": 0.00019629280745776364,
268
+ "loss": 0.9673,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.10905438369923949,
273
+ "grad_norm": 18.140127182006836,
274
+ "learning_rate": 0.0001960406502139808,
275
+ "loss": 0.9903,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.11192423590185105,
280
+ "grad_norm": 19.997053146362305,
281
+ "learning_rate": 0.00019578036997770296,
282
+ "loss": 0.9715,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.11479408810446262,
287
+ "grad_norm": 15.790470123291016,
288
+ "learning_rate": 0.0001955119887630919,
289
+ "loss": 0.9508,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.11766394030707418,
294
+ "grad_norm": 18.330507278442383,
295
+ "learning_rate": 0.0001952355292694795,
296
+ "loss": 0.9867,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.12053379250968575,
301
+ "grad_norm": 13.211642265319824,
302
+ "learning_rate": 0.0001949510148794478,
303
+ "loss": 1.0481,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.12340364471229731,
308
+ "grad_norm": 9.442767143249512,
309
+ "learning_rate": 0.00019465846965685158,
310
+ "loss": 0.9686,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.12627349691490888,
315
+ "grad_norm": 15.597809791564941,
316
+ "learning_rate": 0.00019435791834478293,
317
+ "loss": 1.0821,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.12914334911752046,
322
+ "grad_norm": 13.517879486083984,
323
+ "learning_rate": 0.0001940493863634784,
324
+ "loss": 0.9397,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.132013201320132,
329
+ "grad_norm": 13.031438827514648,
330
+ "learning_rate": 0.00019373289980816917,
331
+ "loss": 1.0009,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.13488305352274357,
336
+ "grad_norm": 14.64666748046875,
337
+ "learning_rate": 0.00019340848544687386,
338
+ "loss": 0.9571,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.13775290572535515,
343
+ "grad_norm": 10.706031799316406,
344
+ "learning_rate": 0.00019307617071813454,
345
+ "loss": 1.0283,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.14062275792796672,
350
+ "grad_norm": 9.723997116088867,
351
+ "learning_rate": 0.00019273598372869603,
352
+ "loss": 0.9815,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.14349261013057826,
357
+ "grad_norm": 9.667860984802246,
358
+ "learning_rate": 0.0001923879532511287,
359
+ "loss": 0.9424,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.14636246233318984,
364
+ "grad_norm": 6.956273078918457,
365
+ "learning_rate": 0.00019203210872139476,
366
+ "loss": 0.9793,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.1492323145358014,
371
+ "grad_norm": 15.395605087280273,
372
+ "learning_rate": 0.00019166848023635883,
373
+ "loss": 1.0637,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.15210216673841298,
378
+ "grad_norm": 23.60310173034668,
379
+ "learning_rate": 0.0001912970985512422,
380
+ "loss": 0.9625,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.15497201894102453,
385
+ "grad_norm": 20.658727645874023,
386
+ "learning_rate": 0.00019091799507702181,
387
+ "loss": 0.9393,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.1578418711436361,
392
+ "grad_norm": 18.22756576538086,
393
+ "learning_rate": 0.0001905312018777733,
394
+ "loss": 0.9354,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.16071172334624767,
399
+ "grad_norm": 11.863499641418457,
400
+ "learning_rate": 0.00019013675166795922,
401
+ "loss": 0.933,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.16358157554885924,
406
+ "grad_norm": 11.65882682800293,
407
+ "learning_rate": 0.00018973467780966202,
408
+ "loss": 0.9119,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.1664514277514708,
413
+ "grad_norm": 11.474069595336914,
414
+ "learning_rate": 0.00018932501430976242,
415
+ "loss": 0.9511,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.16932127995408236,
420
+ "grad_norm": 8.225656509399414,
421
+ "learning_rate": 0.00018890779581706303,
422
+ "loss": 0.9474,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.17219113215669393,
427
+ "grad_norm": 15.780831336975098,
428
+ "learning_rate": 0.00018848305761935797,
429
+ "loss": 0.9528,
430
+ "step": 600
431
  }
432
  ],
433
  "logging_steps": 10,
 
447
  "attributes": {}
448
  }
449
  },
450
+ "total_flos": 2.4551296598016e+19,
451
  "train_batch_size": 6,
452
  "trial_name": null,
453
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a7dea7536ca58c97fbccef8df0fd6f88f39f81b73f5f4eafbed1d750e825400
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91ccf64bfb489d98f1d53ff4b75bafff9ef6970cd7568bffdd38c9685c6b4b38
3
  size 6033