8BitStudio committed on
Commit
df88288
·
verified ·
1 Parent(s): 754e294

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d307d63e2bd5721810a84f6c0aded4fa80bef46ee1e253293ea2ea57f13b2b35
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ed7f5192373055df50388d1e8a342b0008cc7f264c290f2f40d0816847f2899
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec9a01e5ca2a705c9e40bcd141b1b30c981cdb1ecde544437140b15a37cec10c
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b8be61aa4b411ba072b5dd099697cc18dd1215103eeea9cd79dbfb70d181d7a
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:181e60f1f01e165fe1ea237e5ea1bc5e876c0b1a74e9355ac894a44ab5895cb2
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f11385365889b74991a13277667854d4ee120983e8addb357d466767c0b9ff
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e91a2aad683dbb34b2ed1315719473de36da58f4dbb5b8ab53f09c5f23b65cac
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ac42a4d50be277865df4f8c22478009406dfd138fc6ebe8a41f41d644b86db8
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.02185792349726776,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -288,6 +288,286 @@
288
  "learning_rate": 0.000249875,
289
  "loss": 3.3105,
290
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  }
292
  ],
293
  "logging_steps": 50,
@@ -307,7 +587,7 @@
307
  "attributes": {}
308
  }
309
  },
310
- "total_flos": 1.069483900796928e+18,
311
  "train_batch_size": 16,
312
  "trial_name": null,
313
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0060874316939892,
6
  "eval_steps": 500,
7
+ "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
288
  "learning_rate": 0.000249875,
289
  "loss": 3.3105,
290
  "step": 2000
291
+ },
292
+ {
293
+ "epoch": 0.022404371584699455,
294
+ "grad_norm": 0.86328125,
295
+ "learning_rate": 0.000256125,
296
+ "loss": 3.25,
297
+ "step": 2050
298
+ },
299
+ {
300
+ "epoch": 0.022950819672131147,
301
+ "grad_norm": 0.9375,
302
+ "learning_rate": 0.00026237499999999997,
303
+ "loss": 3.1414,
304
+ "step": 2100
305
+ },
306
+ {
307
+ "epoch": 0.023497267759562842,
308
+ "grad_norm": 0.86328125,
309
+ "learning_rate": 0.000268625,
310
+ "loss": 3.1565,
311
+ "step": 2150
312
+ },
313
+ {
314
+ "epoch": 0.024043715846994537,
315
+ "grad_norm": 0.80859375,
316
+ "learning_rate": 0.000274875,
317
+ "loss": 3.1131,
318
+ "step": 2200
319
+ },
320
+ {
321
+ "epoch": 0.02459016393442623,
322
+ "grad_norm": 0.91015625,
323
+ "learning_rate": 0.00028112499999999996,
324
+ "loss": 3.0784,
325
+ "step": 2250
326
+ },
327
+ {
328
+ "epoch": 0.025136612021857924,
329
+ "grad_norm": 0.80859375,
330
+ "learning_rate": 0.000287375,
331
+ "loss": 3.0332,
332
+ "step": 2300
333
+ },
334
+ {
335
+ "epoch": 0.025683060109289616,
336
+ "grad_norm": 0.85546875,
337
+ "learning_rate": 0.000293625,
338
+ "loss": 3.0955,
339
+ "step": 2350
340
+ },
341
+ {
342
+ "epoch": 0.02622950819672131,
343
+ "grad_norm": 0.734375,
344
+ "learning_rate": 0.000299875,
345
+ "loss": 3.045,
346
+ "step": 2400
347
+ },
348
+ {
349
+ "epoch": 0.026775956284153007,
350
+ "grad_norm": 0.8359375,
351
+ "learning_rate": 0.0002999997761290961,
352
+ "loss": 2.995,
353
+ "step": 2450
354
+ },
355
+ {
356
+ "epoch": 0.0273224043715847,
357
+ "grad_norm": 0.75390625,
358
+ "learning_rate": 0.0002999990861486685,
359
+ "loss": 2.9428,
360
+ "step": 2500
361
+ },
362
+ {
363
+ "epoch": 0.027868852459016394,
364
+ "grad_norm": 0.71484375,
365
+ "learning_rate": 0.00029999792996762107,
366
+ "loss": 2.9131,
367
+ "step": 2550
368
+ },
369
+ {
370
+ "epoch": 0.02841530054644809,
371
+ "grad_norm": 0.88671875,
372
+ "learning_rate": 0.00029999630758954706,
373
+ "loss": 2.896,
374
+ "step": 2600
375
+ },
376
+ {
377
+ "epoch": 0.02896174863387978,
378
+ "grad_norm": 0.73828125,
379
+ "learning_rate": 0.000299994219019489,
380
+ "loss": 2.8605,
381
+ "step": 2650
382
+ },
383
+ {
384
+ "epoch": 0.029508196721311476,
385
+ "grad_norm": 0.66796875,
386
+ "learning_rate": 0.0002999916642639382,
387
+ "loss": 2.8407,
388
+ "step": 2700
389
+ },
390
+ {
391
+ "epoch": 0.030054644808743168,
392
+ "grad_norm": 0.68359375,
393
+ "learning_rate": 0.0002999886433308348,
394
+ "loss": 2.8313,
395
+ "step": 2750
396
+ },
397
+ {
398
+ "epoch": 0.030601092896174863,
399
+ "grad_norm": 0.69140625,
400
+ "learning_rate": 0.00029998515622956803,
401
+ "loss": 2.8194,
402
+ "step": 2800
403
+ },
404
+ {
405
+ "epoch": 0.03114754098360656,
406
+ "grad_norm": 0.61328125,
407
+ "learning_rate": 0.00029998120297097586,
408
+ "loss": 2.7874,
409
+ "step": 2850
410
+ },
411
+ {
412
+ "epoch": 0.03169398907103825,
413
+ "grad_norm": 0.67578125,
414
+ "learning_rate": 0.00029997678356734504,
415
+ "loss": 2.7631,
416
+ "step": 2900
417
+ },
418
+ {
419
+ "epoch": 0.03224043715846994,
420
+ "grad_norm": 0.6875,
421
+ "learning_rate": 0.0002999718980324113,
422
+ "loss": 2.7603,
423
+ "step": 2950
424
+ },
425
+ {
426
+ "epoch": 0.03278688524590164,
427
+ "grad_norm": 0.62890625,
428
+ "learning_rate": 0.0002999665463813589,
429
+ "loss": 2.7229,
430
+ "step": 3000
431
+ },
432
+ {
433
+ "epoch": 0.03333333333333333,
434
+ "grad_norm": 0.671875,
435
+ "learning_rate": 0.00029996072863082093,
436
+ "loss": 2.7895,
437
+ "step": 3050
438
+ },
439
+ {
440
+ "epoch": 0.033879781420765025,
441
+ "grad_norm": 0.91796875,
442
+ "learning_rate": 0.0002999544447988791,
443
+ "loss": 2.6505,
444
+ "step": 3100
445
+ },
446
+ {
447
+ "epoch": 0.03442622950819672,
448
+ "grad_norm": 0.60546875,
449
+ "learning_rate": 0.0002999476949050637,
450
+ "loss": 2.6744,
451
+ "step": 3150
452
+ },
453
+ {
454
+ "epoch": 0.034972677595628415,
455
+ "grad_norm": 0.59375,
456
+ "learning_rate": 0.0002999404789703535,
457
+ "loss": 2.6869,
458
+ "step": 3200
459
+ },
460
+ {
461
+ "epoch": 0.03551912568306011,
462
+ "grad_norm": 0.76953125,
463
+ "learning_rate": 0.0002999327970171759,
464
+ "loss": 2.6726,
465
+ "step": 3250
466
+ },
467
+ {
468
+ "epoch": 0.036065573770491806,
469
+ "grad_norm": 0.66015625,
470
+ "learning_rate": 0.0002999246490694065,
471
+ "loss": 2.6444,
472
+ "step": 3300
473
+ },
474
+ {
475
+ "epoch": 0.0366120218579235,
476
+ "grad_norm": 0.69921875,
477
+ "learning_rate": 0.0002999160351523693,
478
+ "loss": 2.6568,
479
+ "step": 3350
480
+ },
481
+ {
482
+ "epoch": 0.03715846994535519,
483
+ "grad_norm": 0.625,
484
+ "learning_rate": 0.00029990695529283665,
485
+ "loss": 2.6436,
486
+ "step": 3400
487
+ },
488
+ {
489
+ "epoch": 1.0000765027322405,
490
+ "grad_norm": 0.5390625,
491
+ "learning_rate": 0.00029989740951902885,
492
+ "loss": 2.6468,
493
+ "step": 3450
494
+ },
495
+ {
496
+ "epoch": 1.0006229508196722,
497
+ "grad_norm": 0.578125,
498
+ "learning_rate": 0.0002998873978606145,
499
+ "loss": 2.5703,
500
+ "step": 3500
501
+ },
502
+ {
503
+ "epoch": 1.0011693989071038,
504
+ "grad_norm": 0.6171875,
505
+ "learning_rate": 0.0002998769203487099,
506
+ "loss": 2.6321,
507
+ "step": 3550
508
+ },
509
+ {
510
+ "epoch": 1.0017158469945355,
511
+ "grad_norm": 0.62109375,
512
+ "learning_rate": 0.0002998659770158796,
513
+ "loss": 2.5518,
514
+ "step": 3600
515
+ },
516
+ {
517
+ "epoch": 1.0022622950819673,
518
+ "grad_norm": 0.6171875,
519
+ "learning_rate": 0.0002998545678961356,
520
+ "loss": 2.5255,
521
+ "step": 3650
522
+ },
523
+ {
524
+ "epoch": 1.0028087431693988,
525
+ "grad_norm": 0.66015625,
526
+ "learning_rate": 0.00029984269302493776,
527
+ "loss": 2.4976,
528
+ "step": 3700
529
+ },
530
+ {
531
+ "epoch": 1.0033551912568306,
532
+ "grad_norm": 0.58984375,
533
+ "learning_rate": 0.0002998303524391934,
534
+ "loss": 2.532,
535
+ "step": 3750
536
+ },
537
+ {
538
+ "epoch": 1.0039016393442624,
539
+ "grad_norm": 0.6484375,
540
+ "learning_rate": 0.00029981754617725747,
541
+ "loss": 2.5321,
542
+ "step": 3800
543
+ },
544
+ {
545
+ "epoch": 1.004448087431694,
546
+ "grad_norm": 0.53515625,
547
+ "learning_rate": 0.0002998042742789319,
548
+ "loss": 2.4924,
549
+ "step": 3850
550
+ },
551
+ {
552
+ "epoch": 1.0049945355191257,
553
+ "grad_norm": 0.6015625,
554
+ "learning_rate": 0.0002997905367854663,
555
+ "loss": 2.492,
556
+ "step": 3900
557
+ },
558
+ {
559
+ "epoch": 1.0055409836065574,
560
+ "grad_norm": 0.61328125,
561
+ "learning_rate": 0.00029977633373955696,
562
+ "loss": 2.5266,
563
+ "step": 3950
564
+ },
565
+ {
566
+ "epoch": 1.0060874316939892,
567
+ "grad_norm": 0.58984375,
568
+ "learning_rate": 0.00029976166518534735,
569
+ "loss": 2.4739,
570
+ "step": 4000
571
  }
572
  ],
573
  "logging_steps": 50,
 
587
  "attributes": {}
588
  }
589
  },
590
+ "total_flos": 2.1391181977674056e+18,
591
  "train_batch_size": 16,
592
  "trial_name": null,
593
  "trial_params": null