NBAmine committed on
Commit
91891d0
·
verified ·
1 Parent(s): 21a8f8a

Training in progress, epoch 3, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -32,10 +32,10 @@
32
  "down_proj",
33
  "q_proj",
34
  "o_proj",
35
- "v_proj",
36
- "gate_proj",
37
  "up_proj",
38
- "k_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
32
  "down_proj",
33
  "q_proj",
34
  "o_proj",
35
+ "k_proj",
 
36
  "up_proj",
37
+ "v_proj",
38
+ "gate_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df1c1aa2916d5972f5efeb6284e8eab0c9c72f3782a534e72ebed006d4a326dc
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42057458be0849df210a2b4c2241429197465f786f00b0c91791a8239fe63ce0
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84a3341db1ea0703a3a5bfb4c84d9c6f9c629d7ec814a41d47f66338b4b4316a
3
  size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:486a933a9db49920bfa89b88f3df33a30e37dd2e0d00f86eab85749749cfb1cd
3
  size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c823b36aa64ec6d5ba470435413c8fa628bdc36879db73fd6bcc786691658d3
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9624fb715f3fe663fa916439122fcd0c3a8e903cf9047d070921678e351f1695
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7db9931cd2bdb0cce107e4058673881f0e4939f11f21f05dabe6ed2ca0118fd7
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc94b43794521e81946badd820ca495ec5676bcf0035e98e623d3832e5330ab
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53f409af08acb24ba2f85422d6d830e93fdc97a01268b4582a53eec3cbfeb20a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57dbcaa4c36dfe8b1884cd38afdda1f50d97d5b0660c412d604e987f28a13d71
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 876,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -902,6 +902,458 @@
902
  "eval_samples_per_second": 1.409,
903
  "eval_steps_per_second": 0.353,
904
  "step": 876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
905
  }
906
  ],
907
  "logging_steps": 10,
@@ -921,7 +1373,7 @@
921
  "attributes": {}
922
  }
923
  },
924
- "total_flos": 3.792626248603853e+17,
925
  "train_batch_size": 1,
926
  "trial_name": null,
927
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 1314,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
902
  "eval_samples_per_second": 1.409,
903
  "eval_steps_per_second": 0.353,
904
  "step": 876
905
+ },
906
+ {
907
+ "entropy": 0.07226648146752268,
908
+ "epoch": 2.0091428571428573,
909
+ "grad_norm": 0.19992968440055847,
910
+ "learning_rate": 5.990867579908676e-06,
911
+ "loss": 0.0549,
912
+ "mean_token_accuracy": 0.9871370224282146,
913
+ "num_tokens": 71651.0,
914
+ "step": 880
915
+ },
916
+ {
917
+ "entropy": 0.08760866427328437,
918
+ "epoch": 2.032,
919
+ "grad_norm": 1.5932085514068604,
920
+ "learning_rate": 5.945205479452055e-06,
921
+ "loss": 0.0778,
922
+ "mean_token_accuracy": 0.9797730926424265,
923
+ "num_tokens": 172012.0,
924
+ "step": 890
925
+ },
926
+ {
927
+ "entropy": 0.12492547524161637,
928
+ "epoch": 2.0548571428571427,
929
+ "grad_norm": 0.6689738035202026,
930
+ "learning_rate": 5.8995433789954336e-06,
931
+ "loss": 0.115,
932
+ "mean_token_accuracy": 0.9695353880524635,
933
+ "num_tokens": 232243.0,
934
+ "step": 900
935
+ },
936
+ {
937
+ "entropy": 0.16423010914586483,
938
+ "epoch": 2.077714285714286,
939
+ "grad_norm": 0.5557848811149597,
940
+ "learning_rate": 5.853881278538813e-06,
941
+ "loss": 0.1504,
942
+ "mean_token_accuracy": 0.960123248770833,
943
+ "num_tokens": 271484.0,
944
+ "step": 910
945
+ },
946
+ {
947
+ "entropy": 0.2013902359176427,
948
+ "epoch": 2.1005714285714285,
949
+ "grad_norm": 0.6008731126785278,
950
+ "learning_rate": 5.8082191780821915e-06,
951
+ "loss": 0.185,
952
+ "mean_token_accuracy": 0.9510148607194424,
953
+ "num_tokens": 300041.0,
954
+ "step": 920
955
+ },
956
+ {
957
+ "entropy": 0.16232432541437447,
958
+ "epoch": 2.123428571428571,
959
+ "grad_norm": 0.2098476141691208,
960
+ "learning_rate": 5.762557077625572e-06,
961
+ "loss": 0.1412,
962
+ "mean_token_accuracy": 0.9628485467284917,
963
+ "num_tokens": 389965.0,
964
+ "step": 930
965
+ },
966
+ {
967
+ "entropy": 0.08904013778083027,
968
+ "epoch": 2.1462857142857144,
969
+ "grad_norm": 0.3195517659187317,
970
+ "learning_rate": 5.716894977168949e-06,
971
+ "loss": 0.0845,
972
+ "mean_token_accuracy": 0.9784366983920336,
973
+ "num_tokens": 487704.0,
974
+ "step": 940
975
+ },
976
+ {
977
+ "entropy": 0.12201522623654455,
978
+ "epoch": 2.169142857142857,
979
+ "grad_norm": 1.0207574367523193,
980
+ "learning_rate": 5.6712328767123296e-06,
981
+ "loss": 0.1133,
982
+ "mean_token_accuracy": 0.9702005807310343,
983
+ "num_tokens": 546009.0,
984
+ "step": 950
985
+ },
986
+ {
987
+ "entropy": 0.16437453916296363,
988
+ "epoch": 2.192,
989
+ "grad_norm": 1.6780635118484497,
990
+ "learning_rate": 5.625570776255708e-06,
991
+ "loss": 0.1542,
992
+ "mean_token_accuracy": 0.9600558575242758,
993
+ "num_tokens": 585430.0,
994
+ "step": 960
995
+ },
996
+ {
997
+ "entropy": 0.1969488083384931,
998
+ "epoch": 2.214857142857143,
999
+ "grad_norm": 0.6065514087677002,
1000
+ "learning_rate": 5.5799086757990874e-06,
1001
+ "loss": 0.1817,
1002
+ "mean_token_accuracy": 0.9518488951027393,
1003
+ "num_tokens": 614326.0,
1004
+ "step": 970
1005
+ },
1006
+ {
1007
+ "entropy": 0.17060820223996415,
1008
+ "epoch": 2.2377142857142855,
1009
+ "grad_norm": 0.7302483916282654,
1010
+ "learning_rate": 5.534246575342466e-06,
1011
+ "loss": 0.1466,
1012
+ "mean_token_accuracy": 0.9602311763912439,
1013
+ "num_tokens": 695616.0,
1014
+ "step": 980
1015
+ },
1016
+ {
1017
+ "entropy": 0.09251747198868543,
1018
+ "epoch": 2.2605714285714287,
1019
+ "grad_norm": 0.28182512521743774,
1020
+ "learning_rate": 5.488584474885845e-06,
1021
+ "loss": 0.0865,
1022
+ "mean_token_accuracy": 0.9779592745006085,
1023
+ "num_tokens": 785741.0,
1024
+ "step": 990
1025
+ },
1026
+ {
1027
+ "entropy": 0.11913358124438674,
1028
+ "epoch": 2.2834285714285714,
1029
+ "grad_norm": 0.567164421081543,
1030
+ "learning_rate": 5.442922374429224e-06,
1031
+ "loss": 0.112,
1032
+ "mean_token_accuracy": 0.9705987706780433,
1033
+ "num_tokens": 841989.0,
1034
+ "step": 1000
1035
+ },
1036
+ {
1037
+ "entropy": 0.1637257631868124,
1038
+ "epoch": 2.306285714285714,
1039
+ "grad_norm": 0.8375981450080872,
1040
+ "learning_rate": 5.397260273972603e-06,
1041
+ "loss": 0.1514,
1042
+ "mean_token_accuracy": 0.9596097219735384,
1043
+ "num_tokens": 880259.0,
1044
+ "step": 1010
1045
+ },
1046
+ {
1047
+ "entropy": 0.20183823076076807,
1048
+ "epoch": 2.329142857142857,
1049
+ "grad_norm": 0.5665499567985535,
1050
+ "learning_rate": 5.351598173515982e-06,
1051
+ "loss": 0.1865,
1052
+ "mean_token_accuracy": 0.9503094878047704,
1053
+ "num_tokens": 908910.0,
1054
+ "step": 1020
1055
+ },
1056
+ {
1057
+ "entropy": 0.14882559089455755,
1058
+ "epoch": 2.352,
1059
+ "grad_norm": 0.21251143515110016,
1060
+ "learning_rate": 5.310502283105024e-06,
1061
+ "loss": 0.1316,
1062
+ "mean_token_accuracy": 0.9661899615079165,
1063
+ "num_tokens": 996048.0,
1064
+ "step": 1030
1065
+ },
1066
+ {
1067
+ "entropy": 0.08644997659139335,
1068
+ "epoch": 2.374857142857143,
1069
+ "grad_norm": 0.35939541459083557,
1070
+ "learning_rate": 5.264840182648402e-06,
1071
+ "loss": 0.0803,
1072
+ "mean_token_accuracy": 0.9792920105159283,
1073
+ "num_tokens": 1095327.0,
1074
+ "step": 1040
1075
+ },
1076
+ {
1077
+ "entropy": 0.12371540726162493,
1078
+ "epoch": 2.3977142857142857,
1079
+ "grad_norm": 0.4109712839126587,
1080
+ "learning_rate": 5.219178082191782e-06,
1081
+ "loss": 0.1155,
1082
+ "mean_token_accuracy": 0.9702008839696645,
1083
+ "num_tokens": 1153028.0,
1084
+ "step": 1050
1085
+ },
1086
+ {
1087
+ "entropy": 0.16861008908599615,
1088
+ "epoch": 2.420571428571429,
1089
+ "grad_norm": 0.4903920590877533,
1090
+ "learning_rate": 5.17351598173516e-06,
1091
+ "loss": 0.1553,
1092
+ "mean_token_accuracy": 0.9577290844172239,
1093
+ "num_tokens": 1191216.0,
1094
+ "step": 1060
1095
+ },
1096
+ {
1097
+ "entropy": 0.1997508300933987,
1098
+ "epoch": 2.4434285714285715,
1099
+ "grad_norm": 0.5626015663146973,
1100
+ "learning_rate": 5.1278538812785395e-06,
1101
+ "loss": 0.1836,
1102
+ "mean_token_accuracy": 0.950419794023037,
1103
+ "num_tokens": 1219726.0,
1104
+ "step": 1070
1105
+ },
1106
+ {
1107
+ "entropy": 0.1611965524731204,
1108
+ "epoch": 2.466285714285714,
1109
+ "grad_norm": 0.2858760356903076,
1110
+ "learning_rate": 5.082191780821918e-06,
1111
+ "loss": 0.1406,
1112
+ "mean_token_accuracy": 0.9622184198349715,
1113
+ "num_tokens": 1307694.0,
1114
+ "step": 1080
1115
+ },
1116
+ {
1117
+ "entropy": 0.08963021833915263,
1118
+ "epoch": 2.4891428571428573,
1119
+ "grad_norm": 0.380759060382843,
1120
+ "learning_rate": 5.0365296803652974e-06,
1121
+ "loss": 0.0836,
1122
+ "mean_token_accuracy": 0.9787446241825819,
1123
+ "num_tokens": 1403155.0,
1124
+ "step": 1090
1125
+ },
1126
+ {
1127
+ "entropy": 0.12443877512123436,
1128
+ "epoch": 2.512,
1129
+ "grad_norm": 0.33159124851226807,
1130
+ "learning_rate": 4.990867579908677e-06,
1131
+ "loss": 0.1144,
1132
+ "mean_token_accuracy": 0.9694507587701082,
1133
+ "num_tokens": 1461083.0,
1134
+ "step": 1100
1135
+ },
1136
+ {
1137
+ "entropy": 0.15655358280055226,
1138
+ "epoch": 2.5348571428571427,
1139
+ "grad_norm": 0.4750528335571289,
1140
+ "learning_rate": 4.945205479452055e-06,
1141
+ "loss": 0.1464,
1142
+ "mean_token_accuracy": 0.9608835749328136,
1143
+ "num_tokens": 1499200.0,
1144
+ "step": 1110
1145
+ },
1146
+ {
1147
+ "entropy": 0.18584296074695886,
1148
+ "epoch": 2.557714285714286,
1149
+ "grad_norm": 0.5686924457550049,
1150
+ "learning_rate": 4.899543378995435e-06,
1151
+ "loss": 0.1675,
1152
+ "mean_token_accuracy": 0.9554672811180354,
1153
+ "num_tokens": 1527156.0,
1154
+ "step": 1120
1155
+ },
1156
+ {
1157
+ "entropy": 0.1512902246438898,
1158
+ "epoch": 2.5805714285714285,
1159
+ "grad_norm": 0.36698049306869507,
1160
+ "learning_rate": 4.853881278538813e-06,
1161
+ "loss": 0.1351,
1162
+ "mean_token_accuracy": 0.9630210023373366,
1163
+ "num_tokens": 1604124.0,
1164
+ "step": 1130
1165
+ },
1166
+ {
1167
+ "entropy": 0.08813798553310334,
1168
+ "epoch": 2.603428571428571,
1169
+ "grad_norm": 0.28617146611213684,
1170
+ "learning_rate": 4.8082191780821926e-06,
1171
+ "loss": 0.0862,
1172
+ "mean_token_accuracy": 0.9784242223948241,
1173
+ "num_tokens": 1697131.0,
1174
+ "step": 1140
1175
+ },
1176
+ {
1177
+ "entropy": 0.12058792433235795,
1178
+ "epoch": 2.6262857142857143,
1179
+ "grad_norm": 0.3608751893043518,
1180
+ "learning_rate": 4.762557077625571e-06,
1181
+ "loss": 0.1096,
1182
+ "mean_token_accuracy": 0.9710184000432491,
1183
+ "num_tokens": 1754378.0,
1184
+ "step": 1150
1185
+ },
1186
+ {
1187
+ "entropy": 0.15165836540982128,
1188
+ "epoch": 2.649142857142857,
1189
+ "grad_norm": 0.4203783869743347,
1190
+ "learning_rate": 4.7168949771689505e-06,
1191
+ "loss": 0.1378,
1192
+ "mean_token_accuracy": 0.9628069128841161,
1193
+ "num_tokens": 1793014.0,
1194
+ "step": 1160
1195
+ },
1196
+ {
1197
+ "entropy": 0.1915775102097541,
1198
+ "epoch": 2.672,
1199
+ "grad_norm": 0.5867162942886353,
1200
+ "learning_rate": 4.671232876712329e-06,
1201
+ "loss": 0.1772,
1202
+ "mean_token_accuracy": 0.9522358998656273,
1203
+ "num_tokens": 1821604.0,
1204
+ "step": 1170
1205
+ },
1206
+ {
1207
+ "entropy": 0.15136512140743436,
1208
+ "epoch": 2.694857142857143,
1209
+ "grad_norm": 0.18969348073005676,
1210
+ "learning_rate": 4.625570776255708e-06,
1211
+ "loss": 0.1358,
1212
+ "mean_token_accuracy": 0.9648103080689907,
1213
+ "num_tokens": 1908343.0,
1214
+ "step": 1180
1215
+ },
1216
+ {
1217
+ "entropy": 0.08064724097494036,
1218
+ "epoch": 2.717714285714286,
1219
+ "grad_norm": 0.4214267432689667,
1220
+ "learning_rate": 4.579908675799088e-06,
1221
+ "loss": 0.0742,
1222
+ "mean_token_accuracy": 0.9811519052833318,
1223
+ "num_tokens": 2005050.0,
1224
+ "step": 1190
1225
+ },
1226
+ {
1227
+ "entropy": 0.11832776879891753,
1228
+ "epoch": 2.7405714285714287,
1229
+ "grad_norm": 0.35349538922309875,
1230
+ "learning_rate": 4.534246575342466e-06,
1231
+ "loss": 0.1076,
1232
+ "mean_token_accuracy": 0.9710629042237997,
1233
+ "num_tokens": 2064659.0,
1234
+ "step": 1200
1235
+ },
1236
+ {
1237
+ "entropy": 0.16044411729089916,
1238
+ "epoch": 2.7634285714285713,
1239
+ "grad_norm": 0.5639057159423828,
1240
+ "learning_rate": 4.488584474885846e-06,
1241
+ "loss": 0.1506,
1242
+ "mean_token_accuracy": 0.9597010012716055,
1243
+ "num_tokens": 2104172.0,
1244
+ "step": 1210
1245
+ },
1246
+ {
1247
+ "entropy": 0.18409276246093215,
1248
+ "epoch": 2.7862857142857145,
1249
+ "grad_norm": 0.6648959517478943,
1250
+ "learning_rate": 4.442922374429224e-06,
1251
+ "loss": 0.1677,
1252
+ "mean_token_accuracy": 0.9542941998690366,
1253
+ "num_tokens": 2132500.0,
1254
+ "step": 1220
1255
+ },
1256
+ {
1257
+ "entropy": 0.14733070600777864,
1258
+ "epoch": 2.809142857142857,
1259
+ "grad_norm": 0.19638575613498688,
1260
+ "learning_rate": 4.3972602739726035e-06,
1261
+ "loss": 0.131,
1262
+ "mean_token_accuracy": 0.9638711795210838,
1263
+ "num_tokens": 2223474.0,
1264
+ "step": 1230
1265
+ },
1266
+ {
1267
+ "entropy": 0.07547924700193107,
1268
+ "epoch": 2.832,
1269
+ "grad_norm": 0.3911222815513611,
1270
+ "learning_rate": 4.351598173515982e-06,
1271
+ "loss": 0.0689,
1272
+ "mean_token_accuracy": 0.9817693259567022,
1273
+ "num_tokens": 2326229.0,
1274
+ "step": 1240
1275
+ },
1276
+ {
1277
+ "entropy": 0.11080959427636117,
1278
+ "epoch": 2.854857142857143,
1279
+ "grad_norm": 0.39502909779548645,
1280
+ "learning_rate": 4.305936073059361e-06,
1281
+ "loss": 0.1041,
1282
+ "mean_token_accuracy": 0.9721482455730438,
1283
+ "num_tokens": 2386201.0,
1284
+ "step": 1250
1285
+ },
1286
+ {
1287
+ "entropy": 0.15536890965886413,
1288
+ "epoch": 2.8777142857142857,
1289
+ "grad_norm": 0.5055193901062012,
1290
+ "learning_rate": 4.260273972602741e-06,
1291
+ "loss": 0.1455,
1292
+ "mean_token_accuracy": 0.9603518169373274,
1293
+ "num_tokens": 2426168.0,
1294
+ "step": 1260
1295
+ },
1296
+ {
1297
+ "entropy": 0.17867730939760804,
1298
+ "epoch": 2.9005714285714284,
1299
+ "grad_norm": 0.571367621421814,
1300
+ "learning_rate": 4.214611872146119e-06,
1301
+ "loss": 0.1641,
1302
+ "mean_token_accuracy": 0.9538291383534669,
1303
+ "num_tokens": 2455084.0,
1304
+ "step": 1270
1305
+ },
1306
+ {
1307
+ "entropy": 0.15099742623278872,
1308
+ "epoch": 2.9234285714285715,
1309
+ "grad_norm": 0.28434649109840393,
1310
+ "learning_rate": 4.168949771689499e-06,
1311
+ "loss": 0.1345,
1312
+ "mean_token_accuracy": 0.9636810082942248,
1313
+ "num_tokens": 2539444.0,
1314
+ "step": 1280
1315
+ },
1316
+ {
1317
+ "entropy": 0.09173946799710393,
1318
+ "epoch": 2.946285714285714,
1319
+ "grad_norm": 0.29292547702789307,
1320
+ "learning_rate": 4.123287671232877e-06,
1321
+ "loss": 0.0867,
1322
+ "mean_token_accuracy": 0.9774239655584097,
1323
+ "num_tokens": 2629661.0,
1324
+ "step": 1290
1325
+ },
1326
+ {
1327
+ "entropy": 0.13389600275550037,
1328
+ "epoch": 2.9691428571428573,
1329
+ "grad_norm": 0.4315743148326874,
1330
+ "learning_rate": 4.0776255707762565e-06,
1331
+ "loss": 0.1248,
1332
+ "mean_token_accuracy": 0.9657621275633573,
1333
+ "num_tokens": 2678664.0,
1334
+ "step": 1300
1335
+ },
1336
+ {
1337
+ "entropy": 0.1770935676060617,
1338
+ "epoch": 2.992,
1339
+ "grad_norm": 0.5184078216552734,
1340
+ "learning_rate": 4.031963470319635e-06,
1341
+ "loss": 0.1654,
1342
+ "mean_token_accuracy": 0.9563698008656502,
1343
+ "num_tokens": 2709664.0,
1344
+ "step": 1310
1345
+ },
1346
+ {
1347
+ "epoch": 3.0,
1348
+ "eval_accuracy": 0.001526303634008455,
1349
+ "eval_entropy": 0.3425323714620819,
1350
+ "eval_loss": 1.1199185848236084,
1351
+ "eval_mean_token_accuracy": 0.8341653729037428,
1352
+ "eval_num_tokens": 2716693.0,
1353
+ "eval_runtime": 784.2466,
1354
+ "eval_samples_per_second": 1.318,
1355
+ "eval_steps_per_second": 0.33,
1356
+ "step": 1314
1357
  }
1358
  ],
1359
  "logging_steps": 10,
 
1373
  "attributes": {}
1374
  }
1375
  },
1376
+ "total_flos": 5.688939372905779e+17,
1377
  "train_batch_size": 1,
1378
  "trial_name": null,
1379
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd36270ff585b2d668c6df7d7ada51207c25255f0fc66fa207d06a8a67152786
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc1b103633cf7c9962527dcf216e434ddad474edf117eac5e9f686412165c6b7
3
  size 6353