ljcamargo commited on
Commit
aa9d591
·
verified ·
1 Parent(s): 27f0836

Training in progress, step 1200, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12451cbc3f681c4d652adfbac061b94a617fe5288eadb9338e0c0116f1ab09a0
3
  size 3237829088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afcab61d0f0cd6492620de0981d9e3af3b1d7bf197c5b9a30367af1e1384d769
3
  size 3237829088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2ac56c591f7d766084f7d7411df625b2b9385404ec3f7c675d4dea7dc7e6caf
3
  size 2062272049
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4bf418fd1d2214f3b6dd2acb610220e68b528633af9c01c8a0638ef623a8e37
3
  size 2062272049
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5dbfcd19dd04e87ea5a0a02be9024658d9d950751b047080030098d848f7be93
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63dea8701860badef7f13a7093d7c8f6df4c5eb7423d37c0b1df9d89c9a49eb9
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:504b7bc543b9e5f039f6559d07b099507a66c15c86836ff5981e4eee51792c02
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b04ef7af3a89dd0eb8778c7ed7d28aeab310d9f53593d47cc2bdc9458a253ac
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb4a875674808f59301dfb232d0f66520f4ad75d8962141e97389a5c2ef15def
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76df728eeb65e9565f5601b8baa2ec9eb380c004fbfe2e79296e73893ec398b4
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2582866982350409,
6
  "eval_steps": 300,
7
- "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -638,6 +638,216 @@
638
  "learning_rate": 0.00017244452248319896,
639
  "loss": 0.8771,
640
  "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641
  }
642
  ],
643
  "logging_steps": 10,
@@ -657,7 +867,7 @@
657
  "attributes": {}
658
  }
659
  },
660
- "total_flos": 3.6826944897024e+19,
661
  "train_batch_size": 6,
662
  "trial_name": null,
663
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.34438226431338786,
6
  "eval_steps": 300,
7
+ "global_step": 1200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
638
  "learning_rate": 0.00017244452248319896,
639
  "loss": 0.8771,
640
  "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.26115655043765246,
644
+ "grad_norm": 46.30351638793945,
645
+ "learning_rate": 0.00017180750950109504,
646
+ "loss": 0.788,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.264026402640264,
651
+ "grad_norm": 6.262620449066162,
652
+ "learning_rate": 0.0001711644231338208,
653
+ "loss": 0.916,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.2668962548428756,
658
+ "grad_norm": 7.936816215515137,
659
+ "learning_rate": 0.00017051531777277952,
660
+ "loss": 0.8425,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.26976610704548715,
665
+ "grad_norm": 10.233474731445312,
666
+ "learning_rate": 0.00016986024831845296,
667
+ "loss": 0.9159,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.27263595924809875,
672
+ "grad_norm": 13.751338958740234,
673
+ "learning_rate": 0.00016919927017575832,
674
+ "loss": 0.8484,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.2755058114507103,
679
+ "grad_norm": 18.70934295654297,
680
+ "learning_rate": 0.00016853243924936173,
681
+ "loss": 0.8387,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.27837566365332184,
686
+ "grad_norm": 6.2156853675842285,
687
+ "learning_rate": 0.0001678598119389502,
688
+ "loss": 0.9127,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.28124551585593344,
693
+ "grad_norm": 10.486414909362793,
694
+ "learning_rate": 0.00016718144513446127,
695
+ "loss": 0.861,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.284115368058545,
700
+ "grad_norm": 7.782724380493164,
701
+ "learning_rate": 0.00016649739621127146,
702
+ "loss": 0.8739,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.2869852202611565,
707
+ "grad_norm": 30.388168334960938,
708
+ "learning_rate": 0.00016580772302534337,
709
+ "loss": 0.9009,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.2898550724637681,
714
+ "grad_norm": 7.943617343902588,
715
+ "learning_rate": 0.0001651124839083324,
716
+ "loss": 0.8113,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.29272492466637967,
721
+ "grad_norm": 8.402076721191406,
722
+ "learning_rate": 0.00016441173766265315,
723
+ "loss": 0.8076,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.29559477686899127,
728
+ "grad_norm": 7.3927764892578125,
729
+ "learning_rate": 0.00016370554355650584,
730
+ "loss": 0.8263,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.2984646290716028,
735
+ "grad_norm": 8.749371528625488,
736
+ "learning_rate": 0.0001629939613188638,
737
+ "loss": 0.8673,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.30133448127421436,
742
+ "grad_norm": 4.924167156219482,
743
+ "learning_rate": 0.0001622770511344213,
744
+ "loss": 0.869,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.30420433347682596,
749
+ "grad_norm": 34.14529037475586,
750
+ "learning_rate": 0.00016155487363850342,
751
+ "loss": 0.9202,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.3070741856794375,
756
+ "grad_norm": 13.217582702636719,
757
+ "learning_rate": 0.00016082748991193757,
758
+ "loss": 0.8409,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.30994403788204905,
763
+ "grad_norm": 19.251298904418945,
764
+ "learning_rate": 0.00016009496147588735,
765
+ "loss": 0.8624,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.31281389008466065,
770
+ "grad_norm": 52.710453033447266,
771
+ "learning_rate": 0.00015935735028664908,
772
+ "loss": 0.8695,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.3156837422872722,
777
+ "grad_norm": 15.96419906616211,
778
+ "learning_rate": 0.00015861471873041184,
779
+ "loss": 0.8773,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.3185535944898838,
784
+ "grad_norm": 7.947400093078613,
785
+ "learning_rate": 0.0001578671296179806,
786
+ "loss": 0.8387,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.32142344669249534,
791
+ "grad_norm": 13.167436599731445,
792
+ "learning_rate": 0.00015711464617946402,
793
+ "loss": 0.8582,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.3242932988951069,
798
+ "grad_norm": 11.579595565795898,
799
+ "learning_rate": 0.00015635733205892653,
800
+ "loss": 0.8615,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.3271631510977185,
805
+ "grad_norm": 4.840546131134033,
806
+ "learning_rate": 0.00015559525130900523,
807
+ "loss": 0.822,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.33003300330033003,
812
+ "grad_norm": 8.159014701843262,
813
+ "learning_rate": 0.0001548284683854925,
814
+ "loss": 0.8512,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.3329028555029416,
819
+ "grad_norm": 33.13652038574219,
820
+ "learning_rate": 0.00015405704814188442,
821
+ "loss": 0.8686,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.3357727077055532,
826
+ "grad_norm": 5.398830890655518,
827
+ "learning_rate": 0.00015328105582389557,
828
+ "loss": 0.8685,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.3386425599081647,
833
+ "grad_norm": 23.8563289642334,
834
+ "learning_rate": 0.00015250055706394057,
835
+ "loss": 0.8617,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.3415124121107763,
840
+ "grad_norm": 5.886293411254883,
841
+ "learning_rate": 0.00015171561787558297,
842
+ "loss": 0.8559,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.34438226431338786,
847
+ "grad_norm": 7.887658596038818,
848
+ "learning_rate": 0.000150926304647952,
849
+ "loss": 0.8811,
850
+ "step": 1200
851
  }
852
  ],
853
  "logging_steps": 10,
 
867
  "attributes": {}
868
  }
869
  },
870
+ "total_flos": 4.9102593196032e+19,
871
  "train_batch_size": 6,
872
  "trial_name": null,
873
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91ccf64bfb489d98f1d53ff4b75bafff9ef6970cd7568bffdd38c9685c6b4b38
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c826a3ab5235a63f61a75099a41de538ae2f6fe824df40b96ea1279de029afd1
3
  size 6033