ljcamargo commited on
Commit
d5dd415
·
verified ·
1 Parent(s): 696ba8b

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56df8a1ec29b8cbb0c42a5264c1932a6249f0599e86f80aff848eb9853130cc0
3
  size 3809184360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:819f332804a581d3bba9599568c4b930cc17c3add617b442d1de87bc2c61522d
3
  size 3809184360
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25722e7bf5fd0346d0d888862ebc90ce085ec4c8463a3f71bd2676c7fe82bedb
3
  size 2457459557
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:758843891eea80d9773d542a9e0b754fa85102134de0c976e5a5692d56fcf5eb
3
  size 2457459557
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdb8b30fd18ca6a24d25d627bd2e13a1e8f8cc7de78183781c4d89f29175eee0
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cba1622dcdafd049462e53ff248b7fef51550571b13d9c8fdcf0e035f1f9f15a
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8b125c082de6f20d827ac9ce3a7228054a763972dd6779dfe18031391e49829
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eff7a0a9bded1a7bd3fdba602c9613b8d890d63962a1be5e1c426de3b212f74
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3,
6
  "eval_steps": 500,
7
- "global_step": 750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -533,6 +533,181 @@
533
  "learning_rate": 3.5383064516129035e-05,
534
  "loss": 0.8895,
535
  "step": 750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
  }
537
  ],
538
  "logging_steps": 10,
@@ -552,7 +727,7 @@
552
  "attributes": {}
553
  }
554
  },
555
- "total_flos": 1.3535660950272e+16,
556
  "train_batch_size": 2,
557
  "trial_name": null,
558
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4,
6
  "eval_steps": 500,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
533
  "learning_rate": 3.5383064516129035e-05,
534
  "loss": 0.8895,
535
  "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.304,
539
+ "grad_norm": 23.548425674438477,
540
+ "learning_rate": 3.518145161290323e-05,
541
+ "loss": 0.8265,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.308,
546
+ "grad_norm": 13.085064888000488,
547
+ "learning_rate": 3.497983870967742e-05,
548
+ "loss": 0.699,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.312,
553
+ "grad_norm": 18.08486557006836,
554
+ "learning_rate": 3.477822580645161e-05,
555
+ "loss": 0.8628,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.316,
560
+ "grad_norm": 19.6446590423584,
561
+ "learning_rate": 3.457661290322581e-05,
562
+ "loss": 0.7162,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.32,
567
+ "grad_norm": 12.074228286743164,
568
+ "learning_rate": 3.4375e-05,
569
+ "loss": 0.8185,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.324,
574
+ "grad_norm": 23.04579734802246,
575
+ "learning_rate": 3.41733870967742e-05,
576
+ "loss": 0.7279,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.328,
581
+ "grad_norm": 14.027397155761719,
582
+ "learning_rate": 3.397177419354839e-05,
583
+ "loss": 0.7416,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.332,
588
+ "grad_norm": 14.763921737670898,
589
+ "learning_rate": 3.377016129032258e-05,
590
+ "loss": 0.7644,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.336,
595
+ "grad_norm": 19.76808738708496,
596
+ "learning_rate": 3.3568548387096774e-05,
597
+ "loss": 0.8096,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.34,
602
+ "grad_norm": 18.55767059326172,
603
+ "learning_rate": 3.336693548387097e-05,
604
+ "loss": 0.7125,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.344,
609
+ "grad_norm": 19.148754119873047,
610
+ "learning_rate": 3.3165322580645164e-05,
611
+ "loss": 0.7357,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.348,
616
+ "grad_norm": 23.397655487060547,
617
+ "learning_rate": 3.296370967741936e-05,
618
+ "loss": 0.7064,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.352,
623
+ "grad_norm": 35.228939056396484,
624
+ "learning_rate": 3.2762096774193553e-05,
625
+ "loss": 0.625,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.356,
630
+ "grad_norm": 29.815994262695312,
631
+ "learning_rate": 3.256048387096775e-05,
632
+ "loss": 0.7508,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.36,
637
+ "grad_norm": 15.86763858795166,
638
+ "learning_rate": 3.2358870967741936e-05,
639
+ "loss": 0.6148,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.364,
644
+ "grad_norm": 19.546428680419922,
645
+ "learning_rate": 3.215725806451613e-05,
646
+ "loss": 0.9258,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.368,
651
+ "grad_norm": 31.96335220336914,
652
+ "learning_rate": 3.1955645161290326e-05,
653
+ "loss": 0.7065,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.372,
658
+ "grad_norm": 17.415355682373047,
659
+ "learning_rate": 3.175403225806452e-05,
660
+ "loss": 0.6998,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.376,
665
+ "grad_norm": 11.98726749420166,
666
+ "learning_rate": 3.1552419354838715e-05,
667
+ "loss": 0.8191,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.38,
672
+ "grad_norm": 13.42738151550293,
673
+ "learning_rate": 3.135080645161291e-05,
674
+ "loss": 0.7401,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.384,
679
+ "grad_norm": 13.586627006530762,
680
+ "learning_rate": 3.11491935483871e-05,
681
+ "loss": 0.6598,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.388,
686
+ "grad_norm": 10.619296073913574,
687
+ "learning_rate": 3.0947580645161286e-05,
688
+ "loss": 0.876,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.392,
693
+ "grad_norm": 16.72213363647461,
694
+ "learning_rate": 3.074596774193548e-05,
695
+ "loss": 0.8656,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.396,
700
+ "grad_norm": 13.87735366821289,
701
+ "learning_rate": 3.0544354838709676e-05,
702
+ "loss": 0.5487,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.4,
707
+ "grad_norm": 13.45494556427002,
708
+ "learning_rate": 3.034274193548387e-05,
709
+ "loss": 0.7243,
710
+ "step": 1000
711
  }
712
  ],
713
  "logging_steps": 10,
 
727
  "attributes": {}
728
  }
729
  },
730
+ "total_flos": 1.8015590195712e+16,
731
  "train_batch_size": 2,
732
  "trial_name": null,
733
  "trial_params": null