NguyenTan committed on
Commit
901be13
·
verified ·
1 Parent(s): 1a0b576

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02fd35dd30945d2a331fcc4229d682044d2333729fe2d642c02c88a4f8950290
3
  size 1583480280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:232d8b560f57f2f3e0c9ea82b2971aa7233a2f0da7541ae72a69d5be5f7c4c0f
3
  size 1583480280
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bfd5531726b88f6d30b16379bf8128025316616e6a5fdc46f82548ddef8b2f4
3
  size 3166958572
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7399efa78dd93a8dfe84c65853be63e1808d3a804e7c5d366051c082ddb0a0bf
3
  size 3166958572
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:172c6e9da1198fecb1104ae5588ac154055d22275bb62749b67f1d60379ff0a7
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:585a7c86a661a1c7d3b02c426dea20960cb9ee4b64c7bdd75f0ac4d7fe0b9d2f
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14ae2a2128444abab378aa06c09a61a84665f758fcc19fc46f5789b0bc1b5665
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b825a5491bdbff2d6e4a9c3f7df2b4cc6e7db1d9df411de1f4114308ac5fa922
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68aec417c91400a5fbe9c98d7447dabd74ed3b0812272a5f21d640985e919bad
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a368adbb52e795449b24532caf9095d64cedebc3cc6ea07dd29c30f5c86c5a8b
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 1000,
3
- "best_metric": 1.5370113849639893,
4
- "best_model_checkpoint": "hieptt/vietnamese-correction-ft/checkpoint-1000",
5
- "epoch": 0.024186136506554445,
6
  "eval_steps": 1000,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -724,6 +724,715 @@
724
  "eval_samples_per_second": 71.621,
725
  "eval_steps_per_second": 0.56,
726
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
727
  }
728
  ],
729
  "logging_steps": 10,
@@ -752,7 +1461,7 @@
752
  "attributes": {}
753
  }
754
  },
755
- "total_flos": 2.054005594049741e+16,
756
  "train_batch_size": 64,
757
  "trial_name": null,
758
  "trial_params": null
 
1
  {
2
+ "best_global_step": 2000,
3
+ "best_metric": 1.5096291303634644,
4
+ "best_model_checkpoint": "hieptt/vietnamese-correction-ft/checkpoint-2000",
5
+ "epoch": 0.04837227301310889,
6
  "eval_steps": 1000,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
724
  "eval_samples_per_second": 71.621,
725
  "eval_steps_per_second": 0.56,
726
  "step": 1000
727
+ },
728
+ {
729
+ "epoch": 0.024427997871619988,
730
+ "grad_norm": 0.48966652154922485,
731
+ "learning_rate": 4.999999583799493e-05,
732
+ "loss": 1.5447,
733
+ "step": 1010
734
+ },
735
+ {
736
+ "epoch": 0.02466985923668553,
737
+ "grad_norm": 0.6714717149734497,
738
+ "learning_rate": 4.999998145081868e-05,
739
+ "loss": 1.5504,
740
+ "step": 1020
741
+ },
742
+ {
743
+ "epoch": 0.024911720601751078,
744
+ "grad_norm": 0.7765893340110779,
745
+ "learning_rate": 4.999995678709439e-05,
746
+ "loss": 1.5565,
747
+ "step": 1030
748
+ },
749
+ {
750
+ "epoch": 0.02515358196681662,
751
+ "grad_norm": 0.4742671549320221,
752
+ "learning_rate": 4.999992184683219e-05,
753
+ "loss": 1.5453,
754
+ "step": 1040
755
+ },
756
+ {
757
+ "epoch": 0.025395443331882164,
758
+ "grad_norm": 0.5306811332702637,
759
+ "learning_rate": 4.999987663004646e-05,
760
+ "loss": 1.5512,
761
+ "step": 1050
762
+ },
763
+ {
764
+ "epoch": 0.02563730469694771,
765
+ "grad_norm": 0.43038302659988403,
766
+ "learning_rate": 4.9999821136755766e-05,
767
+ "loss": 1.5513,
768
+ "step": 1060
769
+ },
770
+ {
771
+ "epoch": 0.025879166062013254,
772
+ "grad_norm": 0.788059413433075,
773
+ "learning_rate": 4.9999755366982925e-05,
774
+ "loss": 1.5326,
775
+ "step": 1070
776
+ },
777
+ {
778
+ "epoch": 0.026121027427078797,
779
+ "grad_norm": 0.4768883287906647,
780
+ "learning_rate": 4.999967932075499e-05,
781
+ "loss": 1.5526,
782
+ "step": 1080
783
+ },
784
+ {
785
+ "epoch": 0.026362888792144344,
786
+ "grad_norm": 0.383400559425354,
787
+ "learning_rate": 4.99995929981032e-05,
788
+ "loss": 1.5518,
789
+ "step": 1090
790
+ },
791
+ {
792
+ "epoch": 0.026604750157209887,
793
+ "grad_norm": 0.5224942564964294,
794
+ "learning_rate": 4.999949639906304e-05,
795
+ "loss": 1.5495,
796
+ "step": 1100
797
+ },
798
+ {
799
+ "epoch": 0.02684661152227543,
800
+ "grad_norm": 0.4375554025173187,
801
+ "learning_rate": 4.999938952367422e-05,
802
+ "loss": 1.5521,
803
+ "step": 1110
804
+ },
805
+ {
806
+ "epoch": 0.027088472887340977,
807
+ "grad_norm": 0.44675424695014954,
808
+ "learning_rate": 4.999927237198069e-05,
809
+ "loss": 1.5475,
810
+ "step": 1120
811
+ },
812
+ {
813
+ "epoch": 0.02733033425240652,
814
+ "grad_norm": 0.5646783709526062,
815
+ "learning_rate": 4.999914494403059e-05,
816
+ "loss": 1.539,
817
+ "step": 1130
818
+ },
819
+ {
820
+ "epoch": 0.027572195617472064,
821
+ "grad_norm": 0.5079995393753052,
822
+ "learning_rate": 4.9999007239876294e-05,
823
+ "loss": 1.5437,
824
+ "step": 1140
825
+ },
826
+ {
827
+ "epoch": 0.02781405698253761,
828
+ "grad_norm": 0.4094880223274231,
829
+ "learning_rate": 4.999885925957443e-05,
830
+ "loss": 1.5354,
831
+ "step": 1150
832
+ },
833
+ {
834
+ "epoch": 0.028055918347603154,
835
+ "grad_norm": 0.4403417408466339,
836
+ "learning_rate": 4.99987010031858e-05,
837
+ "loss": 1.5445,
838
+ "step": 1160
839
+ },
840
+ {
841
+ "epoch": 0.028297779712668697,
842
+ "grad_norm": 0.3404127061367035,
843
+ "learning_rate": 4.9998532470775484e-05,
844
+ "loss": 1.5321,
845
+ "step": 1170
846
+ },
847
+ {
848
+ "epoch": 0.028539641077734244,
849
+ "grad_norm": 0.4042949378490448,
850
+ "learning_rate": 4.999835366241274e-05,
851
+ "loss": 1.5442,
852
+ "step": 1180
853
+ },
854
+ {
855
+ "epoch": 0.028781502442799787,
856
+ "grad_norm": 0.3902073800563812,
857
+ "learning_rate": 4.9998164578171076e-05,
858
+ "loss": 1.5358,
859
+ "step": 1190
860
+ },
861
+ {
862
+ "epoch": 0.02902336380786533,
863
+ "grad_norm": 0.4594404399394989,
864
+ "learning_rate": 4.999796521812822e-05,
865
+ "loss": 1.5282,
866
+ "step": 1200
867
+ },
868
+ {
869
+ "epoch": 0.029265225172930877,
870
+ "grad_norm": 0.4223099648952484,
871
+ "learning_rate": 4.999775558236611e-05,
872
+ "loss": 1.5388,
873
+ "step": 1210
874
+ },
875
+ {
876
+ "epoch": 0.02950708653799642,
877
+ "grad_norm": 0.6008235812187195,
878
+ "learning_rate": 4.999753567097094e-05,
879
+ "loss": 1.5392,
880
+ "step": 1220
881
+ },
882
+ {
883
+ "epoch": 0.029748947903061963,
884
+ "grad_norm": 0.5003873705863953,
885
+ "learning_rate": 4.9997305484033085e-05,
886
+ "loss": 1.5434,
887
+ "step": 1230
888
+ },
889
+ {
890
+ "epoch": 0.02999080926812751,
891
+ "grad_norm": 0.5244422554969788,
892
+ "learning_rate": 4.999706502164718e-05,
893
+ "loss": 1.5481,
894
+ "step": 1240
895
+ },
896
+ {
897
+ "epoch": 0.030232670633193053,
898
+ "grad_norm": 0.36595821380615234,
899
+ "learning_rate": 4.999681428391207e-05,
900
+ "loss": 1.544,
901
+ "step": 1250
902
+ },
903
+ {
904
+ "epoch": 0.030474531998258596,
905
+ "grad_norm": 0.5237463116645813,
906
+ "learning_rate": 4.999655327093081e-05,
907
+ "loss": 1.5377,
908
+ "step": 1260
909
+ },
910
+ {
911
+ "epoch": 0.030716393363324143,
912
+ "grad_norm": 0.4382268190383911,
913
+ "learning_rate": 4.999628198281072e-05,
914
+ "loss": 1.5382,
915
+ "step": 1270
916
+ },
917
+ {
918
+ "epoch": 0.030958254728389686,
919
+ "grad_norm": 0.5116040706634521,
920
+ "learning_rate": 4.999600041966328e-05,
921
+ "loss": 1.5383,
922
+ "step": 1280
923
+ },
924
+ {
925
+ "epoch": 0.031200116093455233,
926
+ "grad_norm": 0.3517632782459259,
927
+ "learning_rate": 4.999570858160426e-05,
928
+ "loss": 1.5284,
929
+ "step": 1290
930
+ },
931
+ {
932
+ "epoch": 0.03144197745852077,
933
+ "grad_norm": 0.46076980233192444,
934
+ "learning_rate": 4.999540646875361e-05,
935
+ "loss": 1.5347,
936
+ "step": 1300
937
+ },
938
+ {
939
+ "epoch": 0.03168383882358632,
940
+ "grad_norm": 0.6168367266654968,
941
+ "learning_rate": 4.9995094081235524e-05,
942
+ "loss": 1.5387,
943
+ "step": 1310
944
+ },
945
+ {
946
+ "epoch": 0.031925700188651866,
947
+ "grad_norm": 0.40505921840667725,
948
+ "learning_rate": 4.9994771419178396e-05,
949
+ "loss": 1.5375,
950
+ "step": 1320
951
+ },
952
+ {
953
+ "epoch": 0.03216756155371741,
954
+ "grad_norm": 0.4371592104434967,
955
+ "learning_rate": 4.999443848271489e-05,
956
+ "loss": 1.5363,
957
+ "step": 1330
958
+ },
959
+ {
960
+ "epoch": 0.03240942291878295,
961
+ "grad_norm": 0.518997311592102,
962
+ "learning_rate": 4.9994095271981835e-05,
963
+ "loss": 1.5434,
964
+ "step": 1340
965
+ },
966
+ {
967
+ "epoch": 0.032651284283848496,
968
+ "grad_norm": 0.8396134972572327,
969
+ "learning_rate": 4.999374178712032e-05,
970
+ "loss": 1.5324,
971
+ "step": 1350
972
+ },
973
+ {
974
+ "epoch": 0.03289314564891404,
975
+ "grad_norm": 0.41988566517829895,
976
+ "learning_rate": 4.999337802827566e-05,
977
+ "loss": 1.5314,
978
+ "step": 1360
979
+ },
980
+ {
981
+ "epoch": 0.03313500701397959,
982
+ "grad_norm": 0.3672787845134735,
983
+ "learning_rate": 4.999300399559738e-05,
984
+ "loss": 1.525,
985
+ "step": 1370
986
+ },
987
+ {
988
+ "epoch": 0.03337686837904513,
989
+ "grad_norm": 0.4160480499267578,
990
+ "learning_rate": 4.999261968923922e-05,
991
+ "loss": 1.5298,
992
+ "step": 1380
993
+ },
994
+ {
995
+ "epoch": 0.033618729744110676,
996
+ "grad_norm": 0.5236791372299194,
997
+ "learning_rate": 4.999222510935915e-05,
998
+ "loss": 1.5306,
999
+ "step": 1390
1000
+ },
1001
+ {
1002
+ "epoch": 0.03386059110917622,
1003
+ "grad_norm": 0.4650459587574005,
1004
+ "learning_rate": 4.9991820256119385e-05,
1005
+ "loss": 1.535,
1006
+ "step": 1400
1007
+ },
1008
+ {
1009
+ "epoch": 0.03410245247424176,
1010
+ "grad_norm": 0.39175882935523987,
1011
+ "learning_rate": 4.999140512968634e-05,
1012
+ "loss": 1.5302,
1013
+ "step": 1410
1014
+ },
1015
+ {
1016
+ "epoch": 0.03434431383930731,
1017
+ "grad_norm": 0.35965096950531006,
1018
+ "learning_rate": 4.999097973023065e-05,
1019
+ "loss": 1.5236,
1020
+ "step": 1420
1021
+ },
1022
+ {
1023
+ "epoch": 0.034586175204372856,
1024
+ "grad_norm": 0.3973771333694458,
1025
+ "learning_rate": 4.999054405792718e-05,
1026
+ "loss": 1.5261,
1027
+ "step": 1430
1028
+ },
1029
+ {
1030
+ "epoch": 0.0348280365694384,
1031
+ "grad_norm": 0.5168911218643188,
1032
+ "learning_rate": 4.999009811295503e-05,
1033
+ "loss": 1.5289,
1034
+ "step": 1440
1035
+ },
1036
+ {
1037
+ "epoch": 0.03506989793450394,
1038
+ "grad_norm": 0.4921228587627411,
1039
+ "learning_rate": 4.998964189549751e-05,
1040
+ "loss": 1.537,
1041
+ "step": 1450
1042
+ },
1043
+ {
1044
+ "epoch": 0.035311759299569485,
1045
+ "grad_norm": 0.559264600276947,
1046
+ "learning_rate": 4.9989175405742135e-05,
1047
+ "loss": 1.5322,
1048
+ "step": 1460
1049
+ },
1050
+ {
1051
+ "epoch": 0.03555362066463503,
1052
+ "grad_norm": 0.5126819014549255,
1053
+ "learning_rate": 4.998869864388068e-05,
1054
+ "loss": 1.5369,
1055
+ "step": 1470
1056
+ },
1057
+ {
1058
+ "epoch": 0.03579548202970058,
1059
+ "grad_norm": 0.4884808361530304,
1060
+ "learning_rate": 4.998821161010912e-05,
1061
+ "loss": 1.5359,
1062
+ "step": 1480
1063
+ },
1064
+ {
1065
+ "epoch": 0.03603734339476612,
1066
+ "grad_norm": 1.4691296815872192,
1067
+ "learning_rate": 4.9987714304627655e-05,
1068
+ "loss": 1.529,
1069
+ "step": 1490
1070
+ },
1071
+ {
1072
+ "epoch": 0.036279204759831665,
1073
+ "grad_norm": 23.75047492980957,
1074
+ "learning_rate": 4.9987206727640703e-05,
1075
+ "loss": 1.9818,
1076
+ "step": 1500
1077
+ },
1078
+ {
1079
+ "epoch": 0.03652106612489721,
1080
+ "grad_norm": 1.1937427520751953,
1081
+ "learning_rate": 4.998668887935691e-05,
1082
+ "loss": 2.3099,
1083
+ "step": 1510
1084
+ },
1085
+ {
1086
+ "epoch": 0.03676292748996275,
1087
+ "grad_norm": 1.2184133529663086,
1088
+ "learning_rate": 4.998616075998916e-05,
1089
+ "loss": 1.9202,
1090
+ "step": 1520
1091
+ },
1092
+ {
1093
+ "epoch": 0.037004788855028295,
1094
+ "grad_norm": 0.720676839351654,
1095
+ "learning_rate": 4.9985622369754525e-05,
1096
+ "loss": 1.8545,
1097
+ "step": 1530
1098
+ },
1099
+ {
1100
+ "epoch": 0.037246650220093845,
1101
+ "grad_norm": 0.4644893705844879,
1102
+ "learning_rate": 4.998507370887433e-05,
1103
+ "loss": 1.6034,
1104
+ "step": 1540
1105
+ },
1106
+ {
1107
+ "epoch": 0.03748851158515939,
1108
+ "grad_norm": 0.6309983134269714,
1109
+ "learning_rate": 4.9984514777574085e-05,
1110
+ "loss": 1.5414,
1111
+ "step": 1550
1112
+ },
1113
+ {
1114
+ "epoch": 0.03773037295022493,
1115
+ "grad_norm": 0.3813267648220062,
1116
+ "learning_rate": 4.998394557608358e-05,
1117
+ "loss": 1.5335,
1118
+ "step": 1560
1119
+ },
1120
+ {
1121
+ "epoch": 0.037972234315290475,
1122
+ "grad_norm": 0.7492319941520691,
1123
+ "learning_rate": 4.998336610463677e-05,
1124
+ "loss": 1.5299,
1125
+ "step": 1570
1126
+ },
1127
+ {
1128
+ "epoch": 0.03821409568035602,
1129
+ "grad_norm": 0.5672308802604675,
1130
+ "learning_rate": 4.998277636347186e-05,
1131
+ "loss": 1.5323,
1132
+ "step": 1580
1133
+ },
1134
+ {
1135
+ "epoch": 0.03845595704542156,
1136
+ "grad_norm": 0.3646668791770935,
1137
+ "learning_rate": 4.998217635283127e-05,
1138
+ "loss": 1.525,
1139
+ "step": 1590
1140
+ },
1141
+ {
1142
+ "epoch": 0.03869781841048711,
1143
+ "grad_norm": 0.46738356351852417,
1144
+ "learning_rate": 4.998156607296163e-05,
1145
+ "loss": 1.5258,
1146
+ "step": 1600
1147
+ },
1148
+ {
1149
+ "epoch": 0.038939679775552655,
1150
+ "grad_norm": 0.413133442401886,
1151
+ "learning_rate": 4.998094552411382e-05,
1152
+ "loss": 1.5317,
1153
+ "step": 1610
1154
+ },
1155
+ {
1156
+ "epoch": 0.0391815411406182,
1157
+ "grad_norm": 0.9869425892829895,
1158
+ "learning_rate": 4.9980314706542916e-05,
1159
+ "loss": 1.5286,
1160
+ "step": 1620
1161
+ },
1162
+ {
1163
+ "epoch": 0.03942340250568374,
1164
+ "grad_norm": 0.44352006912231445,
1165
+ "learning_rate": 4.997967362050824e-05,
1166
+ "loss": 1.518,
1167
+ "step": 1630
1168
+ },
1169
+ {
1170
+ "epoch": 0.039665263870749284,
1171
+ "grad_norm": 0.33023595809936523,
1172
+ "learning_rate": 4.997902226627329e-05,
1173
+ "loss": 1.5239,
1174
+ "step": 1640
1175
+ },
1176
+ {
1177
+ "epoch": 0.03990712523581483,
1178
+ "grad_norm": 0.5091515779495239,
1179
+ "learning_rate": 4.997836064410583e-05,
1180
+ "loss": 1.524,
1181
+ "step": 1650
1182
+ },
1183
+ {
1184
+ "epoch": 0.04014898660088038,
1185
+ "grad_norm": 0.42869803309440613,
1186
+ "learning_rate": 4.997768875427782e-05,
1187
+ "loss": 1.5244,
1188
+ "step": 1660
1189
+ },
1190
+ {
1191
+ "epoch": 0.04039084796594592,
1192
+ "grad_norm": 0.40443161129951477,
1193
+ "learning_rate": 4.997700659706545e-05,
1194
+ "loss": 1.5201,
1195
+ "step": 1670
1196
+ },
1197
+ {
1198
+ "epoch": 0.040632709331011464,
1199
+ "grad_norm": 0.37971532344818115,
1200
+ "learning_rate": 4.997631417274914e-05,
1201
+ "loss": 1.5283,
1202
+ "step": 1680
1203
+ },
1204
+ {
1205
+ "epoch": 0.04087457069607701,
1206
+ "grad_norm": 0.4408821165561676,
1207
+ "learning_rate": 4.997561148161351e-05,
1208
+ "loss": 1.5241,
1209
+ "step": 1690
1210
+ },
1211
+ {
1212
+ "epoch": 0.04111643206114255,
1213
+ "grad_norm": 0.5017372965812683,
1214
+ "learning_rate": 4.997489852394741e-05,
1215
+ "loss": 1.519,
1216
+ "step": 1700
1217
+ },
1218
+ {
1219
+ "epoch": 0.0413582934262081,
1220
+ "grad_norm": 0.3806293308734894,
1221
+ "learning_rate": 4.997417530004391e-05,
1222
+ "loss": 1.5278,
1223
+ "step": 1710
1224
+ },
1225
+ {
1226
+ "epoch": 0.041600154791273644,
1227
+ "grad_norm": 1.141066312789917,
1228
+ "learning_rate": 4.9973441810200306e-05,
1229
+ "loss": 1.5174,
1230
+ "step": 1720
1231
+ },
1232
+ {
1233
+ "epoch": 0.04184201615633919,
1234
+ "grad_norm": 0.3906162977218628,
1235
+ "learning_rate": 4.997269805471809e-05,
1236
+ "loss": 1.519,
1237
+ "step": 1730
1238
+ },
1239
+ {
1240
+ "epoch": 0.04208387752140473,
1241
+ "grad_norm": 0.5911729335784912,
1242
+ "learning_rate": 4.997194403390302e-05,
1243
+ "loss": 1.536,
1244
+ "step": 1740
1245
+ },
1246
+ {
1247
+ "epoch": 0.042325738886470274,
1248
+ "grad_norm": 0.6229117512702942,
1249
+ "learning_rate": 4.9971179748065024e-05,
1250
+ "loss": 1.5263,
1251
+ "step": 1750
1252
+ },
1253
+ {
1254
+ "epoch": 0.04256760025153582,
1255
+ "grad_norm": 0.4941336512565613,
1256
+ "learning_rate": 4.997040519751828e-05,
1257
+ "loss": 1.5202,
1258
+ "step": 1760
1259
+ },
1260
+ {
1261
+ "epoch": 0.04280946161660137,
1262
+ "grad_norm": 0.6714040040969849,
1263
+ "learning_rate": 4.996962038258117e-05,
1264
+ "loss": 1.5184,
1265
+ "step": 1770
1266
+ },
1267
+ {
1268
+ "epoch": 0.04305132298166691,
1269
+ "grad_norm": 0.4575778841972351,
1270
+ "learning_rate": 4.9968825303576314e-05,
1271
+ "loss": 1.5265,
1272
+ "step": 1780
1273
+ },
1274
+ {
1275
+ "epoch": 0.043293184346732454,
1276
+ "grad_norm": 0.3734686076641083,
1277
+ "learning_rate": 4.996801996083052e-05,
1278
+ "loss": 1.5223,
1279
+ "step": 1790
1280
+ },
1281
+ {
1282
+ "epoch": 0.043535045711798,
1283
+ "grad_norm": 0.6092630624771118,
1284
+ "learning_rate": 4.996720435467485e-05,
1285
+ "loss": 1.5184,
1286
+ "step": 1800
1287
+ },
1288
+ {
1289
+ "epoch": 0.04377690707686354,
1290
+ "grad_norm": 0.31611162424087524,
1291
+ "learning_rate": 4.9966378485444567e-05,
1292
+ "loss": 1.5201,
1293
+ "step": 1810
1294
+ },
1295
+ {
1296
+ "epoch": 0.04401876844192908,
1297
+ "grad_norm": 0.4829297661781311,
1298
+ "learning_rate": 4.9965542353479144e-05,
1299
+ "loss": 1.519,
1300
+ "step": 1820
1301
+ },
1302
+ {
1303
+ "epoch": 0.044260629806994634,
1304
+ "grad_norm": 0.4227820634841919,
1305
+ "learning_rate": 4.9964695959122294e-05,
1306
+ "loss": 1.5147,
1307
+ "step": 1830
1308
+ },
1309
+ {
1310
+ "epoch": 0.04450249117206018,
1311
+ "grad_norm": 0.4444202184677124,
1312
+ "learning_rate": 4.9963839302721936e-05,
1313
+ "loss": 1.5241,
1314
+ "step": 1840
1315
+ },
1316
+ {
1317
+ "epoch": 0.04474435253712572,
1318
+ "grad_norm": 0.42105644941329956,
1319
+ "learning_rate": 4.99629723846302e-05,
1320
+ "loss": 1.5248,
1321
+ "step": 1850
1322
+ },
1323
+ {
1324
+ "epoch": 0.04498621390219126,
1325
+ "grad_norm": 0.34201350808143616,
1326
+ "learning_rate": 4.996209520520346e-05,
1327
+ "loss": 1.5097,
1328
+ "step": 1860
1329
+ },
1330
+ {
1331
+ "epoch": 0.045228075267256806,
1332
+ "grad_norm": 0.410153865814209,
1333
+ "learning_rate": 4.9961207764802275e-05,
1334
+ "loss": 1.5191,
1335
+ "step": 1870
1336
+ },
1337
+ {
1338
+ "epoch": 0.04546993663232235,
1339
+ "grad_norm": 0.38393330574035645,
1340
+ "learning_rate": 4.996031006379145e-05,
1341
+ "loss": 1.5119,
1342
+ "step": 1880
1343
+ },
1344
+ {
1345
+ "epoch": 0.0457117979973879,
1346
+ "grad_norm": 0.3539496958255768,
1347
+ "learning_rate": 4.9959402102539986e-05,
1348
+ "loss": 1.5105,
1349
+ "step": 1890
1350
+ },
1351
+ {
1352
+ "epoch": 0.04595365936245344,
1353
+ "grad_norm": 0.8583787679672241,
1354
+ "learning_rate": 4.995848388142112e-05,
1355
+ "loss": 1.5276,
1356
+ "step": 1900
1357
+ },
1358
+ {
1359
+ "epoch": 0.046195520727518986,
1360
+ "grad_norm": 0.3652508854866028,
1361
+ "learning_rate": 4.995755540081229e-05,
1362
+ "loss": 1.5133,
1363
+ "step": 1910
1364
+ },
1365
+ {
1366
+ "epoch": 0.04643738209258453,
1367
+ "grad_norm": 0.7512590885162354,
1368
+ "learning_rate": 4.995661666109518e-05,
1369
+ "loss": 1.5167,
1370
+ "step": 1920
1371
+ },
1372
+ {
1373
+ "epoch": 0.04667924345765007,
1374
+ "grad_norm": 0.4336129128932953,
1375
+ "learning_rate": 4.9955667662655636e-05,
1376
+ "loss": 1.5171,
1377
+ "step": 1930
1378
+ },
1379
+ {
1380
+ "epoch": 0.046921104822715616,
1381
+ "grad_norm": 0.4716378450393677,
1382
+ "learning_rate": 4.995470840588379e-05,
1383
+ "loss": 1.5336,
1384
+ "step": 1940
1385
+ },
1386
+ {
1387
+ "epoch": 0.047162966187781166,
1388
+ "grad_norm": 0.3509134352207184,
1389
+ "learning_rate": 4.995373889117393e-05,
1390
+ "loss": 1.5282,
1391
+ "step": 1950
1392
+ },
1393
+ {
1394
+ "epoch": 0.04740482755284671,
1395
+ "grad_norm": 0.6889932155609131,
1396
+ "learning_rate": 4.99527591189246e-05,
1397
+ "loss": 1.515,
1398
+ "step": 1960
1399
+ },
1400
+ {
1401
+ "epoch": 0.04764668891791225,
1402
+ "grad_norm": 0.37906014919281006,
1403
+ "learning_rate": 4.995176908953854e-05,
1404
+ "loss": 1.5097,
1405
+ "step": 1970
1406
+ },
1407
+ {
1408
+ "epoch": 0.047888550282977796,
1409
+ "grad_norm": 0.4350769519805908,
1410
+ "learning_rate": 4.995076880342271e-05,
1411
+ "loss": 1.5081,
1412
+ "step": 1980
1413
+ },
1414
+ {
1415
+ "epoch": 0.04813041164804334,
1416
+ "grad_norm": 0.33059579133987427,
1417
+ "learning_rate": 4.994975826098831e-05,
1418
+ "loss": 1.5157,
1419
+ "step": 1990
1420
+ },
1421
+ {
1422
+ "epoch": 0.04837227301310889,
1423
+ "grad_norm": 0.4527088701725006,
1424
+ "learning_rate": 4.994873746265073e-05,
1425
+ "loss": 1.5202,
1426
+ "step": 2000
1427
+ },
1428
+ {
1429
+ "epoch": 0.04837227301310889,
1430
+ "eval_loss": 1.5096291303634644,
1431
+ "eval_runtime": 1228.5547,
1432
+ "eval_sacrebleu": 96.66770045228822,
1433
+ "eval_samples_per_second": 82.248,
1434
+ "eval_steps_per_second": 0.643,
1435
+ "step": 2000
1436
  }
1437
  ],
1438
  "logging_steps": 10,
 
1461
  "attributes": {}
1462
  }
1463
  },
1464
+ "total_flos": 4.116679707171226e+16,
1465
  "train_batch_size": 64,
1466
  "trial_name": null,
1467
  "trial_params": null