ljcamargo commited on
Commit
aa05ecd
·
verified ·
1 Parent(s): db6cd34

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d40c404ef2c6591a63d62d374d2ae723dbb012f99f314f1f0721032e50b86c4
3
  size 2558403928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ef549acb7bb3a26b9a1d8c83faca397de8618a2dce8c81bde8e287f33fb6c31
3
  size 2558403928
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ffd4ecbcd1f5cdd5bd52f54030b72efa2c358b8e75c6c4731b1e15ea43bd19c
3
  size 1313044361
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f91d2444da719f4454789524b172c58bc341e905a9b460651c04a077f667609
3
  size 1313044361
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fb5c3c2c6a04f8bf56e98b3d5a045f8c1ab465d43652320e01114dda9b0cb0d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0d5dfce4350324a9dd27602ce6d66bb933782beacd43e5d1fc128755bd9060e
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30af866df24edce708e1eb20700878b402fa05707fa9bc5f332496baf440dbbb
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb7fde5111803012042c93a73aa191336bb6e10b3ad44f6bd1d94fc7008a22b6
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e2eb54ad71aa36e8a3c519325614d3113e01de2bc05cb8cce62c849b7fd068c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27384781b4bab02662f6aa01507d1435cf787b396a01371737e0e695f3099df9
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.24,
6
  "eval_steps": 500,
7
- "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -225,6 +225,216 @@
225
  "learning_rate": 0.00019674914092067015,
226
  "loss": 8.2001,
227
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  }
229
  ],
230
  "logging_steps": 10,
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.48,
6
  "eval_steps": 500,
7
+ "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
225
  "learning_rate": 0.00019674914092067015,
226
  "loss": 8.2001,
227
  "step": 300
228
+ },
229
+ {
230
+ "epoch": 0.248,
231
+ "grad_norm": 4.331130027770996,
232
+ "learning_rate": 0.00019640923091244906,
233
+ "loss": 8.3479,
234
+ "step": 310
235
+ },
236
+ {
237
+ "epoch": 0.256,
238
+ "grad_norm": 4.888726711273193,
239
+ "learning_rate": 0.0001960527464775666,
240
+ "loss": 8.0348,
241
+ "step": 320
242
+ },
243
+ {
244
+ "epoch": 0.264,
245
+ "grad_norm": 4.52598762512207,
246
+ "learning_rate": 0.00019567974890190865,
247
+ "loss": 7.9916,
248
+ "step": 330
249
+ },
250
+ {
251
+ "epoch": 0.272,
252
+ "grad_norm": 5.109200477600098,
253
+ "learning_rate": 0.00019529030231025776,
254
+ "loss": 7.8621,
255
+ "step": 340
256
+ },
257
+ {
258
+ "epoch": 0.28,
259
+ "grad_norm": 4.465454578399658,
260
+ "learning_rate": 0.0001948844736552688,
261
+ "loss": 8.1529,
262
+ "step": 350
263
+ },
264
+ {
265
+ "epoch": 0.288,
266
+ "grad_norm": 4.94981050491333,
267
+ "learning_rate": 0.00019446233270595896,
268
+ "loss": 7.9475,
269
+ "step": 360
270
+ },
271
+ {
272
+ "epoch": 0.296,
273
+ "grad_norm": 4.898144245147705,
274
+ "learning_rate": 0.00019402395203571286,
275
+ "loss": 8.1256,
276
+ "step": 370
277
+ },
278
+ {
279
+ "epoch": 0.304,
280
+ "grad_norm": 4.506499767303467,
281
+ "learning_rate": 0.00019356940700980625,
282
+ "loss": 7.9425,
283
+ "step": 380
284
+ },
285
+ {
286
+ "epoch": 0.312,
287
+ "grad_norm": 4.715751647949219,
288
+ "learning_rate": 0.00019309877577244924,
289
+ "loss": 7.8867,
290
+ "step": 390
291
+ },
292
+ {
293
+ "epoch": 0.32,
294
+ "grad_norm": 6.232232570648193,
295
+ "learning_rate": 0.00019261213923335194,
296
+ "loss": 8.0137,
297
+ "step": 400
298
+ },
299
+ {
300
+ "epoch": 0.328,
301
+ "grad_norm": 5.4095258712768555,
302
+ "learning_rate": 0.0001921095810538148,
303
+ "loss": 7.655,
304
+ "step": 410
305
+ },
306
+ {
307
+ "epoch": 0.336,
308
+ "grad_norm": 8.021153450012207,
309
+ "learning_rate": 0.00019159118763234555,
310
+ "loss": 8.239,
311
+ "step": 420
312
+ },
313
+ {
314
+ "epoch": 0.344,
315
+ "grad_norm": 4.821053504943848,
316
+ "learning_rate": 0.0001910570480898061,
317
+ "loss": 7.6991,
318
+ "step": 430
319
+ },
320
+ {
321
+ "epoch": 0.352,
322
+ "grad_norm": 4.676478385925293,
323
+ "learning_rate": 0.00019050725425409076,
324
+ "loss": 7.9241,
325
+ "step": 440
326
+ },
327
+ {
328
+ "epoch": 0.36,
329
+ "grad_norm": 6.322430610656738,
330
+ "learning_rate": 0.0001899419006443397,
331
+ "loss": 8.0194,
332
+ "step": 450
333
+ },
334
+ {
335
+ "epoch": 0.368,
336
+ "grad_norm": 3.8518083095550537,
337
+ "learning_rate": 0.0001893610844546894,
338
+ "loss": 7.7739,
339
+ "step": 460
340
+ },
341
+ {
342
+ "epoch": 0.376,
343
+ "grad_norm": 4.104583263397217,
344
+ "learning_rate": 0.00018876490553756313,
345
+ "loss": 7.7344,
346
+ "step": 470
347
+ },
348
+ {
349
+ "epoch": 0.384,
350
+ "grad_norm": 5.830111980438232,
351
+ "learning_rate": 0.00018815346638650487,
352
+ "loss": 7.4569,
353
+ "step": 480
354
+ },
355
+ {
356
+ "epoch": 0.392,
357
+ "grad_norm": 5.279020309448242,
358
+ "learning_rate": 0.0001875268721185585,
359
+ "loss": 7.6329,
360
+ "step": 490
361
+ },
362
+ {
363
+ "epoch": 0.4,
364
+ "grad_norm": 6.486227989196777,
365
+ "learning_rate": 0.00018688523045619674,
366
+ "loss": 7.6998,
367
+ "step": 500
368
+ },
369
+ {
370
+ "epoch": 0.408,
371
+ "grad_norm": 3.6189093589782715,
372
+ "learning_rate": 0.00018622865170880151,
373
+ "loss": 7.3692,
374
+ "step": 510
375
+ },
376
+ {
377
+ "epoch": 0.416,
378
+ "grad_norm": 4.793766498565674,
379
+ "learning_rate": 0.00018555724875369997,
380
+ "loss": 7.4387,
381
+ "step": 520
382
+ },
383
+ {
384
+ "epoch": 0.424,
385
+ "grad_norm": 3.8143932819366455,
386
+ "learning_rate": 0.00018487113701675883,
387
+ "loss": 7.6311,
388
+ "step": 530
389
+ },
390
+ {
391
+ "epoch": 0.432,
392
+ "grad_norm": 5.568665027618408,
393
+ "learning_rate": 0.00018417043445254075,
394
+ "loss": 7.0967,
395
+ "step": 540
396
+ },
397
+ {
398
+ "epoch": 0.44,
399
+ "grad_norm": 5.059378623962402,
400
+ "learning_rate": 0.00018345526152402573,
401
+ "loss": 7.2701,
402
+ "step": 550
403
+ },
404
+ {
405
+ "epoch": 0.448,
406
+ "grad_norm": 5.139848232269287,
407
+ "learning_rate": 0.00018272574118190167,
408
+ "loss": 7.2647,
409
+ "step": 560
410
+ },
411
+ {
412
+ "epoch": 0.456,
413
+ "grad_norm": 6.435779571533203,
414
+ "learning_rate": 0.00018198199884342673,
415
+ "loss": 7.161,
416
+ "step": 570
417
+ },
418
+ {
419
+ "epoch": 0.464,
420
+ "grad_norm": 5.199296951293945,
421
+ "learning_rate": 0.0001812241623708682,
422
+ "loss": 7.5137,
423
+ "step": 580
424
+ },
425
+ {
426
+ "epoch": 0.472,
427
+ "grad_norm": 4.161045074462891,
428
+ "learning_rate": 0.00018045236204952044,
429
+ "loss": 7.356,
430
+ "step": 590
431
+ },
432
+ {
433
+ "epoch": 0.48,
434
+ "grad_norm": 5.084685325622559,
435
+ "learning_rate": 0.00017966673056530686,
436
+ "loss": 7.3859,
437
+ "step": 600
438
  }
439
  ],
440
  "logging_steps": 10,