mgh6 commited on
Commit
6bc24a9
·
verified ·
1 Parent(s): 322a319

Training in progress, epoch 8, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6e077a077f82244e38d0afcc3d1ca738963ebca1861cee05f8c0d24bf3c61c9
3
  size 2682482800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88647830ecd553d3bbcce815c85cc295f4bf39af9e61197684a6bbf2ad0d22cd
3
  size 2682482800
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c5ccf025b5e035a93f9afcb300d38710204c9fa4bdd9f102d70848632e5ccff
3
  size 5365108834
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6a4f68f91aa0a169d492df4d096b1d4770de24a063b76c7cc1a09f608822ee7
3
  size 5365108834
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d33f3dbdb9b3f7dde1b012b4c45dfa6f4e834ae52f1442515a3bb9195da78f3
3
  size 15006
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21dfc6c263d5ad0f8ba77e03600244b9f2781e61ae66cba4cff3c2ce6c58574f
3
  size 15006
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62099b39eb8addcb11715a980e6ab00ae65f78659f9cd1992430ce564ecc8e81
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86d123b176365e851d79aa73f522c50da61f447efcfc0bcc767ae1a1949443a3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 6.0,
5
  "eval_steps": 50,
6
- "global_step": 1638,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -487,6 +487,171 @@
487
  "eval_samples_per_second": 41.566,
488
  "eval_steps_per_second": 20.783,
489
  "step": 1600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  }
491
  ],
492
  "logging_steps": 50,
@@ -506,7 +671,7 @@
506
  "attributes": {}
507
  }
508
  },
509
- "total_flos": 4.210910911822561e+17,
510
  "train_batch_size": 2,
511
  "trial_name": null,
512
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 8.0,
5
  "eval_steps": 50,
6
+ "global_step": 2184,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
487
  "eval_samples_per_second": 41.566,
488
  "eval_steps_per_second": 20.783,
489
  "step": 1600
490
+ },
491
+ {
492
+ "epoch": 6.044112578977599,
493
+ "grad_norm": 59.42768859863281,
494
+ "learning_rate": 3.933823529411765e-05,
495
+ "loss": 0.8781,
496
+ "step": 1650
497
+ },
498
+ {
499
+ "epoch": 6.044112578977599,
500
+ "eval_loss": 1.233597993850708,
501
+ "eval_runtime": 117.1571,
502
+ "eval_samples_per_second": 41.193,
503
+ "eval_steps_per_second": 20.596,
504
+ "step": 1650
505
+ },
506
+ {
507
+ "epoch": 6.227914991384262,
508
+ "grad_norm": 68.26610565185547,
509
+ "learning_rate": 3.7500000000000003e-05,
510
+ "loss": 0.8804,
511
+ "step": 1700
512
+ },
513
+ {
514
+ "epoch": 6.227914991384262,
515
+ "eval_loss": 1.2279460430145264,
516
+ "eval_runtime": 116.1011,
517
+ "eval_samples_per_second": 41.567,
518
+ "eval_steps_per_second": 20.784,
519
+ "step": 1700
520
+ },
521
+ {
522
+ "epoch": 6.411717403790925,
523
+ "grad_norm": 77.21823120117188,
524
+ "learning_rate": 3.566176470588235e-05,
525
+ "loss": 0.8733,
526
+ "step": 1750
527
+ },
528
+ {
529
+ "epoch": 6.411717403790925,
530
+ "eval_loss": 1.2353451251983643,
531
+ "eval_runtime": 116.0518,
532
+ "eval_samples_per_second": 41.585,
533
+ "eval_steps_per_second": 20.792,
534
+ "step": 1750
535
+ },
536
+ {
537
+ "epoch": 6.595519816197587,
538
+ "grad_norm": 49.22051239013672,
539
+ "learning_rate": 3.382352941176471e-05,
540
+ "loss": 0.875,
541
+ "step": 1800
542
+ },
543
+ {
544
+ "epoch": 6.595519816197587,
545
+ "eval_loss": 1.2324572801589966,
546
+ "eval_runtime": 116.0982,
547
+ "eval_samples_per_second": 41.568,
548
+ "eval_steps_per_second": 20.784,
549
+ "step": 1800
550
+ },
551
+ {
552
+ "epoch": 6.779322228604251,
553
+ "grad_norm": 61.27114486694336,
554
+ "learning_rate": 3.198529411764706e-05,
555
+ "loss": 0.8634,
556
+ "step": 1850
557
+ },
558
+ {
559
+ "epoch": 6.779322228604251,
560
+ "eval_loss": 1.2263100147247314,
561
+ "eval_runtime": 116.0582,
562
+ "eval_samples_per_second": 41.583,
563
+ "eval_steps_per_second": 20.791,
564
+ "step": 1850
565
+ },
566
+ {
567
+ "epoch": 6.963124641010913,
568
+ "grad_norm": 53.27342224121094,
569
+ "learning_rate": 3.0147058823529413e-05,
570
+ "loss": 0.8647,
571
+ "step": 1900
572
+ },
573
+ {
574
+ "epoch": 6.963124641010913,
575
+ "eval_loss": 1.2306259870529175,
576
+ "eval_runtime": 116.21,
577
+ "eval_samples_per_second": 41.528,
578
+ "eval_steps_per_second": 20.764,
579
+ "step": 1900
580
+ },
581
+ {
582
+ "epoch": 7.143365881677197,
583
+ "grad_norm": 56.99700927734375,
584
+ "learning_rate": 2.8308823529411766e-05,
585
+ "loss": 0.8335,
586
+ "step": 1950
587
+ },
588
+ {
589
+ "epoch": 7.143365881677197,
590
+ "eval_loss": 1.2323832511901855,
591
+ "eval_runtime": 116.0282,
592
+ "eval_samples_per_second": 41.593,
593
+ "eval_steps_per_second": 20.797,
594
+ "step": 1950
595
+ },
596
+ {
597
+ "epoch": 7.32716829408386,
598
+ "grad_norm": 111.48177337646484,
599
+ "learning_rate": 2.647058823529412e-05,
600
+ "loss": 0.8489,
601
+ "step": 2000
602
+ },
603
+ {
604
+ "epoch": 7.32716829408386,
605
+ "eval_loss": 1.2314597368240356,
606
+ "eval_runtime": 116.1391,
607
+ "eval_samples_per_second": 41.554,
608
+ "eval_steps_per_second": 20.777,
609
+ "step": 2000
610
+ },
611
+ {
612
+ "epoch": 7.5109707064905225,
613
+ "grad_norm": 44.07224655151367,
614
+ "learning_rate": 2.4632352941176472e-05,
615
+ "loss": 0.8473,
616
+ "step": 2050
617
+ },
618
+ {
619
+ "epoch": 7.5109707064905225,
620
+ "eval_loss": 1.2360129356384277,
621
+ "eval_runtime": 116.3906,
622
+ "eval_samples_per_second": 41.464,
623
+ "eval_steps_per_second": 20.732,
624
+ "step": 2050
625
+ },
626
+ {
627
+ "epoch": 7.694773118897185,
628
+ "grad_norm": 58.74856948852539,
629
+ "learning_rate": 2.2794117647058825e-05,
630
+ "loss": 0.8422,
631
+ "step": 2100
632
+ },
633
+ {
634
+ "epoch": 7.694773118897185,
635
+ "eval_loss": 1.23045015335083,
636
+ "eval_runtime": 116.4238,
637
+ "eval_samples_per_second": 41.452,
638
+ "eval_steps_per_second": 20.726,
639
+ "step": 2100
640
+ },
641
+ {
642
+ "epoch": 7.878575531303849,
643
+ "grad_norm": 42.6165771484375,
644
+ "learning_rate": 2.0955882352941178e-05,
645
+ "loss": 0.8414,
646
+ "step": 2150
647
+ },
648
+ {
649
+ "epoch": 7.878575531303849,
650
+ "eval_loss": 1.232067346572876,
651
+ "eval_runtime": 115.9906,
652
+ "eval_samples_per_second": 41.607,
653
+ "eval_steps_per_second": 20.803,
654
+ "step": 2150
655
  }
656
  ],
657
  "logging_steps": 50,
 
671
  "attributes": {}
672
  }
673
  },
674
+ "total_flos": 5.614783810576056e+17,
675
  "train_batch_size": 2,
676
  "trial_name": null,
677
  "trial_params": null