8BitStudio committed on
Commit
5cd35df
·
verified ·
1 Parent(s): f8f1dec

Training in progress, step 8000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9310a4b888df283774971e4e671540bfed2da01aea080fa39eda067305eeba86
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa2fa49e5ab01e8388f884f001a6fef59415f0afcdf8851cf32b99cba1b66f98
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1f256b63f8887aa92c9795198c14b259ff29bd76f4e601214dd8ad4add4ccd6
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7bd1f1004e066807e00b62878ad4b49df433de186c64f0a97f9237a03eb281b
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2669ee2d37691d1bc42e7a0090a126e105acbd5de1cf305e31cb6b68e55636b7
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2ea0240538fb238def027691182a688f4848085d98c59d8205c56a6ab84887c
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a30b126d1da8ae8870320a9f300ee7d428169650eb20c3a488c09fc00bef14d8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2054ec2901370b6a537467b9fa82f13f962dc91e80e60e56cd6658a9567a46a8
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0279453551912567,
6
  "eval_steps": 500,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -848,6 +848,286 @@
848
  "learning_rate": 0.00029879389136398403,
849
  "loss": 2.1958,
850
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
851
  }
852
  ],
853
  "logging_steps": 50,
@@ -867,7 +1147,7 @@
867
  "attributes": {}
868
  }
869
  },
870
- "total_flos": 3.2086020985643336e+18,
871
  "train_batch_size": 16,
872
  "trial_name": null,
873
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0121748633879784,
6
  "eval_steps": 500,
7
+ "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
848
  "learning_rate": 0.00029879389136398403,
849
  "loss": 2.1958,
850
  "step": 6000
851
+ },
852
+ {
853
+ "epoch": 1.0284918032786885,
854
+ "grad_norm": 0.52734375,
855
+ "learning_rate": 0.00029876019281020207,
856
+ "loss": 2.1853,
857
+ "step": 6050
858
+ },
859
+ {
860
+ "epoch": 1.0290382513661203,
861
+ "grad_norm": 0.52734375,
862
+ "learning_rate": 0.00029872603190630927,
863
+ "loss": 2.1753,
864
+ "step": 6100
865
+ },
866
+ {
867
+ "epoch": 1.029584699453552,
868
+ "grad_norm": 0.5546875,
869
+ "learning_rate": 0.00029869140875847847,
870
+ "loss": 2.1931,
871
+ "step": 6150
872
+ },
873
+ {
874
+ "epoch": 1.0301311475409836,
875
+ "grad_norm": 0.53125,
876
+ "learning_rate": 0.0002986563234743193,
877
+ "loss": 2.1846,
878
+ "step": 6200
879
+ },
880
+ {
881
+ "epoch": 1.0306775956284153,
882
+ "grad_norm": 0.51953125,
883
+ "learning_rate": 0.0002986207761628775,
884
+ "loss": 2.1928,
885
+ "step": 6250
886
+ },
887
+ {
888
+ "epoch": 1.031224043715847,
889
+ "grad_norm": 0.6171875,
890
+ "learning_rate": 0.00029858476693463506,
891
+ "loss": 2.1942,
892
+ "step": 6300
893
+ },
894
+ {
895
+ "epoch": 1.0317704918032786,
896
+ "grad_norm": 0.48828125,
897
+ "learning_rate": 0.0002985482959015094,
898
+ "loss": 2.1653,
899
+ "step": 6350
900
+ },
901
+ {
902
+ "epoch": 1.0323169398907104,
903
+ "grad_norm": 0.5234375,
904
+ "learning_rate": 0.00029851136317685345,
905
+ "loss": 2.1659,
906
+ "step": 6400
907
+ },
908
+ {
909
+ "epoch": 1.0328633879781421,
910
+ "grad_norm": 0.5234375,
911
+ "learning_rate": 0.00029847396887545485,
912
+ "loss": 2.1829,
913
+ "step": 6450
914
+ },
915
+ {
916
+ "epoch": 1.0334098360655737,
917
+ "grad_norm": 0.54296875,
918
+ "learning_rate": 0.00029843611311353597,
919
+ "loss": 2.1911,
920
+ "step": 6500
921
+ },
922
+ {
923
+ "epoch": 1.0339562841530054,
924
+ "grad_norm": 0.51953125,
925
+ "learning_rate": 0.00029839779600875343,
926
+ "loss": 2.1041,
927
+ "step": 6550
928
+ },
929
+ {
930
+ "epoch": 1.0345027322404372,
931
+ "grad_norm": 0.53125,
932
+ "learning_rate": 0.00029835901768019763,
933
+ "loss": 2.1634,
934
+ "step": 6600
935
+ },
936
+ {
937
+ "epoch": 1.0350491803278687,
938
+ "grad_norm": 0.515625,
939
+ "learning_rate": 0.0002983197782483926,
940
+ "loss": 2.1642,
941
+ "step": 6650
942
+ },
943
+ {
944
+ "epoch": 1.0355956284153005,
945
+ "grad_norm": 0.58203125,
946
+ "learning_rate": 0.00029828007783529533,
947
+ "loss": 2.1621,
948
+ "step": 6700
949
+ },
950
+ {
951
+ "epoch": 1.0361420765027323,
952
+ "grad_norm": 0.5078125,
953
+ "learning_rate": 0.0002982399165642956,
954
+ "loss": 2.1553,
955
+ "step": 6750
956
+ },
957
+ {
958
+ "epoch": 1.036688524590164,
959
+ "grad_norm": 0.51171875,
960
+ "learning_rate": 0.00029819929456021565,
961
+ "loss": 2.1592,
962
+ "step": 6800
963
+ },
964
+ {
965
+ "epoch": 1.0372349726775956,
966
+ "grad_norm": 0.51953125,
967
+ "learning_rate": 0.0002981582119493095,
968
+ "loss": 2.1527,
969
+ "step": 6850
970
+ },
971
+ {
972
+ "epoch": 2.000153005464481,
973
+ "grad_norm": 0.5,
974
+ "learning_rate": 0.0002981166688592629,
975
+ "loss": 2.1674,
976
+ "step": 6900
977
+ },
978
+ {
979
+ "epoch": 2.0006994535519125,
980
+ "grad_norm": 0.5390625,
981
+ "learning_rate": 0.00029807466541919273,
982
+ "loss": 2.1168,
983
+ "step": 6950
984
+ },
985
+ {
986
+ "epoch": 2.0012459016393445,
987
+ "grad_norm": 0.53515625,
988
+ "learning_rate": 0.00029803220175964675,
989
+ "loss": 2.1439,
990
+ "step": 7000
991
+ },
992
+ {
993
+ "epoch": 2.001792349726776,
994
+ "grad_norm": 0.5,
995
+ "learning_rate": 0.0002979892780126028,
996
+ "loss": 2.0952,
997
+ "step": 7050
998
+ },
999
+ {
1000
+ "epoch": 2.0023387978142075,
1001
+ "grad_norm": 0.490234375,
1002
+ "learning_rate": 0.00029794589431146904,
1003
+ "loss": 2.0817,
1004
+ "step": 7100
1005
+ },
1006
+ {
1007
+ "epoch": 2.0028852459016395,
1008
+ "grad_norm": 0.462890625,
1009
+ "learning_rate": 0.00029790205079108294,
1010
+ "loss": 2.0643,
1011
+ "step": 7150
1012
+ },
1013
+ {
1014
+ "epoch": 2.003431693989071,
1015
+ "grad_norm": 0.55078125,
1016
+ "learning_rate": 0.00029785774758771114,
1017
+ "loss": 2.0993,
1018
+ "step": 7200
1019
+ },
1020
+ {
1021
+ "epoch": 2.0039781420765026,
1022
+ "grad_norm": 0.51953125,
1023
+ "learning_rate": 0.00029781298483904907,
1024
+ "loss": 2.1085,
1025
+ "step": 7250
1026
+ },
1027
+ {
1028
+ "epoch": 2.0045245901639346,
1029
+ "grad_norm": 0.4921875,
1030
+ "learning_rate": 0.0002977677626842204,
1031
+ "loss": 2.0645,
1032
+ "step": 7300
1033
+ },
1034
+ {
1035
+ "epoch": 2.005071038251366,
1036
+ "grad_norm": 0.50390625,
1037
+ "learning_rate": 0.0002977220812637766,
1038
+ "loss": 2.0929,
1039
+ "step": 7350
1040
+ },
1041
+ {
1042
+ "epoch": 2.0056174863387977,
1043
+ "grad_norm": 0.53515625,
1044
+ "learning_rate": 0.0002976759407196966,
1045
+ "loss": 2.0919,
1046
+ "step": 7400
1047
+ },
1048
+ {
1049
+ "epoch": 2.0061639344262296,
1050
+ "grad_norm": 0.55078125,
1051
+ "learning_rate": 0.00029762934119538623,
1052
+ "loss": 2.0903,
1053
+ "step": 7450
1054
+ },
1055
+ {
1056
+ "epoch": 2.006710382513661,
1057
+ "grad_norm": 0.5625,
1058
+ "learning_rate": 0.00029758228283567796,
1059
+ "loss": 2.0481,
1060
+ "step": 7500
1061
+ },
1062
+ {
1063
+ "epoch": 2.0072568306010927,
1064
+ "grad_norm": 0.5703125,
1065
+ "learning_rate": 0.00029753476578683023,
1066
+ "loss": 2.0737,
1067
+ "step": 7550
1068
+ },
1069
+ {
1070
+ "epoch": 2.0078032786885247,
1071
+ "grad_norm": 0.49609375,
1072
+ "learning_rate": 0.00029748679019652704,
1073
+ "loss": 2.1032,
1074
+ "step": 7600
1075
+ },
1076
+ {
1077
+ "epoch": 2.0083497267759562,
1078
+ "grad_norm": 0.5546875,
1079
+ "learning_rate": 0.00029743835621387775,
1080
+ "loss": 2.0722,
1081
+ "step": 7650
1082
+ },
1083
+ {
1084
+ "epoch": 2.008896174863388,
1085
+ "grad_norm": 0.5234375,
1086
+ "learning_rate": 0.00029738946398941623,
1087
+ "loss": 2.057,
1088
+ "step": 7700
1089
+ },
1090
+ {
1091
+ "epoch": 2.0094426229508198,
1092
+ "grad_norm": 0.490234375,
1093
+ "learning_rate": 0.0002973401136751007,
1094
+ "loss": 2.0802,
1095
+ "step": 7750
1096
+ },
1097
+ {
1098
+ "epoch": 2.0099890710382513,
1099
+ "grad_norm": 0.5,
1100
+ "learning_rate": 0.0002972903054243129,
1101
+ "loss": 2.1094,
1102
+ "step": 7800
1103
+ },
1104
+ {
1105
+ "epoch": 2.010535519125683,
1106
+ "grad_norm": 0.515625,
1107
+ "learning_rate": 0.0002972400393918583,
1108
+ "loss": 2.0409,
1109
+ "step": 7850
1110
+ },
1111
+ {
1112
+ "epoch": 2.011081967213115,
1113
+ "grad_norm": 0.578125,
1114
+ "learning_rate": 0.0002971893157339647,
1115
+ "loss": 2.0543,
1116
+ "step": 7900
1117
+ },
1118
+ {
1119
+ "epoch": 2.0116284153005464,
1120
+ "grad_norm": 0.5234375,
1121
+ "learning_rate": 0.0002971381346082824,
1122
+ "loss": 2.0776,
1123
+ "step": 7950
1124
+ },
1125
+ {
1126
+ "epoch": 2.0121748633879784,
1127
+ "grad_norm": 0.55078125,
1128
+ "learning_rate": 0.00029708649617388356,
1129
+ "loss": 2.0629,
1130
+ "step": 8000
1131
  }
1132
  ],
1133
  "logging_steps": 50,
 
1147
  "attributes": {}
1148
  }
1149
  },
1150
+ "total_flos": 4.278236395534811e+18,
1151
  "train_batch_size": 16,
1152
  "trial_name": null,
1153
  "trial_params": null