Sabbir772 commited on
Commit
9ca4aaf
·
verified ·
1 Parent(s): 1099778

Training in progress, step 17405, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:470ceab2ffbdc9be7e9ba55dac8f66ad2d2ee6d83bf68d588bdf8c2e363afa0a
3
  size 990185320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f53a70c963db68f9b694dda2a1e3c2e11ceee1090f7c14aa12ae702859e59120
3
  size 990185320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a8f1de75a1e7f597eeeec866f696144c9ded9443ee76a77efdde665cb83edb3
3
  size 1980545291
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bb559ee87caf15c8957d8760368f094fecca187637f61060b813167bb93e1f7
3
  size 1980545291
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ce3825e60923efd0732224de480af38290fa16b941f44ff5e3049ad2c6cac56
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40f4b20a69e9316fdba91122a6236379e60cdc702f2695921644b9e637ab8165
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3185a876acce70348de92f6615e3c6174f04c918e17668da9430678af0491872
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b004d308497203f25cc61e28f5460f74f23fc3afe51c9ea0caca14c1845f09f
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.987647227808101,
6
  "eval_steps": 400,
7
- "global_step": 10400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -996,12 +996,672 @@
996
  "eval_samples_per_second": 8.418,
997
  "eval_steps_per_second": 1.052,
998
  "step": 10400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
  }
1000
  ],
1001
  "logging_steps": 100,
1002
- "max_steps": 13924,
1003
  "num_input_tokens_seen": 0,
1004
- "num_train_epochs": 4,
1005
  "save_steps": 400,
1006
  "stateful_callbacks": {
1007
  "TrainerControl": {
@@ -1010,12 +1670,12 @@
1010
  "should_evaluate": false,
1011
  "should_log": false,
1012
  "should_save": true,
1013
- "should_training_stop": false
1014
  },
1015
  "attributes": {}
1016
  }
1017
  },
1018
- "total_flos": 2.8481299959250944e+16,
1019
  "train_batch_size": 8,
1020
  "trial_name": null,
1021
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
  "eval_steps": 400,
7
+ "global_step": 17405,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
996
  "eval_samples_per_second": 8.418,
997
  "eval_steps_per_second": 1.052,
998
  "step": 10400
999
+ },
1000
+ {
1001
+ "epoch": 3.0163746049985636,
1002
+ "grad_norm": 7.44285774230957,
1003
+ "learning_rate": 1.3887388681413387e-05,
1004
+ "loss": 0.92,
1005
+ "step": 10500
1006
+ },
1007
+ {
1008
+ "epoch": 3.045101982189026,
1009
+ "grad_norm": 6.678479194641113,
1010
+ "learning_rate": 1.3686297041080149e-05,
1011
+ "loss": 0.9852,
1012
+ "step": 10600
1013
+ },
1014
+ {
1015
+ "epoch": 3.0738293593794888,
1016
+ "grad_norm": 4.869193077087402,
1017
+ "learning_rate": 1.3485205400746911e-05,
1018
+ "loss": 0.9608,
1019
+ "step": 10700
1020
+ },
1021
+ {
1022
+ "epoch": 3.102556736569951,
1023
+ "grad_norm": 9.369580268859863,
1024
+ "learning_rate": 1.3284113760413673e-05,
1025
+ "loss": 1.0157,
1026
+ "step": 10800
1027
+ },
1028
+ {
1029
+ "epoch": 3.102556736569951,
1030
+ "eval_bleu": 45.22945134261269,
1031
+ "eval_chrf": 69.63638434182933,
1032
+ "eval_loss": 0.986176609992981,
1033
+ "eval_runtime": 96.7623,
1034
+ "eval_samples_per_second": 8.433,
1035
+ "eval_steps_per_second": 1.054,
1036
+ "step": 10800
1037
+ },
1038
+ {
1039
+ "epoch": 3.1312841137604135,
1040
+ "grad_norm": 3.69415020942688,
1041
+ "learning_rate": 1.3083022120080436e-05,
1042
+ "loss": 0.9119,
1043
+ "step": 10900
1044
+ },
1045
+ {
1046
+ "epoch": 3.1600114909508763,
1047
+ "grad_norm": 3.1986947059631348,
1048
+ "learning_rate": 1.2881930479747198e-05,
1049
+ "loss": 0.9571,
1050
+ "step": 11000
1051
+ },
1052
+ {
1053
+ "epoch": 3.1887388681413387,
1054
+ "grad_norm": 6.022769451141357,
1055
+ "learning_rate": 1.268083883941396e-05,
1056
+ "loss": 0.9792,
1057
+ "step": 11100
1058
+ },
1059
+ {
1060
+ "epoch": 3.217466245331801,
1061
+ "grad_norm": 5.501678943634033,
1062
+ "learning_rate": 1.2479747199080722e-05,
1063
+ "loss": 0.9616,
1064
+ "step": 11200
1065
+ },
1066
+ {
1067
+ "epoch": 3.217466245331801,
1068
+ "eval_bleu": 44.86788489369196,
1069
+ "eval_chrf": 69.67814444050612,
1070
+ "eval_loss": 0.9908636212348938,
1071
+ "eval_runtime": 97.0809,
1072
+ "eval_samples_per_second": 8.405,
1073
+ "eval_steps_per_second": 1.051,
1074
+ "step": 11200
1075
+ },
1076
+ {
1077
+ "epoch": 3.246193622522264,
1078
+ "grad_norm": 5.359325408935547,
1079
+ "learning_rate": 1.2278655558747484e-05,
1080
+ "loss": 0.9402,
1081
+ "step": 11300
1082
+ },
1083
+ {
1084
+ "epoch": 3.274920999712726,
1085
+ "grad_norm": 5.0644612312316895,
1086
+ "learning_rate": 1.2077563918414248e-05,
1087
+ "loss": 0.9357,
1088
+ "step": 11400
1089
+ },
1090
+ {
1091
+ "epoch": 3.3036483769031886,
1092
+ "grad_norm": 3.8930866718292236,
1093
+ "learning_rate": 1.187647227808101e-05,
1094
+ "loss": 0.9476,
1095
+ "step": 11500
1096
+ },
1097
+ {
1098
+ "epoch": 3.3323757540936514,
1099
+ "grad_norm": 2.5604605674743652,
1100
+ "learning_rate": 1.1675380637747773e-05,
1101
+ "loss": 0.9173,
1102
+ "step": 11600
1103
+ },
1104
+ {
1105
+ "epoch": 3.3323757540936514,
1106
+ "eval_bleu": 45.52285105667052,
1107
+ "eval_chrf": 69.98913215954698,
1108
+ "eval_loss": 0.9875417947769165,
1109
+ "eval_runtime": 96.2372,
1110
+ "eval_samples_per_second": 8.479,
1111
+ "eval_steps_per_second": 1.06,
1112
+ "step": 11600
1113
+ },
1114
+ {
1115
+ "epoch": 3.3611031312841138,
1116
+ "grad_norm": 3.812286853790283,
1117
+ "learning_rate": 1.1474288997414535e-05,
1118
+ "loss": 0.9624,
1119
+ "step": 11700
1120
+ },
1121
+ {
1122
+ "epoch": 3.389830508474576,
1123
+ "grad_norm": 4.970190048217773,
1124
+ "learning_rate": 1.1273197357081299e-05,
1125
+ "loss": 0.9315,
1126
+ "step": 11800
1127
+ },
1128
+ {
1129
+ "epoch": 3.418557885665039,
1130
+ "grad_norm": 4.663548469543457,
1131
+ "learning_rate": 1.107210571674806e-05,
1132
+ "loss": 0.9257,
1133
+ "step": 11900
1134
+ },
1135
+ {
1136
+ "epoch": 3.4472852628555013,
1137
+ "grad_norm": 5.273872375488281,
1138
+ "learning_rate": 1.0871014076414823e-05,
1139
+ "loss": 0.968,
1140
+ "step": 12000
1141
+ },
1142
+ {
1143
+ "epoch": 3.4472852628555013,
1144
+ "eval_bleu": 45.32133917023191,
1145
+ "eval_chrf": 69.85397927785405,
1146
+ "eval_loss": 0.9833332896232605,
1147
+ "eval_runtime": 96.1663,
1148
+ "eval_samples_per_second": 8.485,
1149
+ "eval_steps_per_second": 1.061,
1150
+ "step": 12000
1151
+ },
1152
+ {
1153
+ "epoch": 3.4760126400459637,
1154
+ "grad_norm": 3.624070882797241,
1155
+ "learning_rate": 1.0669922436081585e-05,
1156
+ "loss": 0.9552,
1157
+ "step": 12100
1158
+ },
1159
+ {
1160
+ "epoch": 3.5047400172364265,
1161
+ "grad_norm": 3.9386630058288574,
1162
+ "learning_rate": 1.0468830795748347e-05,
1163
+ "loss": 0.9975,
1164
+ "step": 12200
1165
+ },
1166
+ {
1167
+ "epoch": 3.533467394426889,
1168
+ "grad_norm": 3.272144317626953,
1169
+ "learning_rate": 1.026773915541511e-05,
1170
+ "loss": 0.8777,
1171
+ "step": 12300
1172
+ },
1173
+ {
1174
+ "epoch": 3.562194771617351,
1175
+ "grad_norm": 4.805718898773193,
1176
+ "learning_rate": 1.0066647515081872e-05,
1177
+ "loss": 0.9949,
1178
+ "step": 12400
1179
+ },
1180
+ {
1181
+ "epoch": 3.562194771617351,
1182
+ "eval_bleu": 45.55482284185368,
1183
+ "eval_chrf": 69.95827933302967,
1184
+ "eval_loss": 0.9825245141983032,
1185
+ "eval_runtime": 96.6388,
1186
+ "eval_samples_per_second": 8.444,
1187
+ "eval_steps_per_second": 1.055,
1188
+ "step": 12400
1189
+ },
1190
+ {
1191
+ "epoch": 3.590922148807814,
1192
+ "grad_norm": 4.920676231384277,
1193
+ "learning_rate": 9.865555874748634e-06,
1194
+ "loss": 1.0276,
1195
+ "step": 12500
1196
+ },
1197
+ {
1198
+ "epoch": 3.6196495259982764,
1199
+ "grad_norm": 6.698337554931641,
1200
+ "learning_rate": 9.664464234415396e-06,
1201
+ "loss": 0.9667,
1202
+ "step": 12600
1203
+ },
1204
+ {
1205
+ "epoch": 3.6483769031887388,
1206
+ "grad_norm": 4.400453567504883,
1207
+ "learning_rate": 9.46337259408216e-06,
1208
+ "loss": 0.9661,
1209
+ "step": 12700
1210
+ },
1211
+ {
1212
+ "epoch": 3.6771042803792016,
1213
+ "grad_norm": 4.711784839630127,
1214
+ "learning_rate": 9.262280953748922e-06,
1215
+ "loss": 0.94,
1216
+ "step": 12800
1217
+ },
1218
+ {
1219
+ "epoch": 3.6771042803792016,
1220
+ "eval_bleu": 45.46850521898221,
1221
+ "eval_chrf": 69.87260183216233,
1222
+ "eval_loss": 0.984427809715271,
1223
+ "eval_runtime": 96.1804,
1224
+ "eval_samples_per_second": 8.484,
1225
+ "eval_steps_per_second": 1.061,
1226
+ "step": 12800
1227
+ },
1228
+ {
1229
+ "epoch": 3.705831657569664,
1230
+ "grad_norm": 5.415602207183838,
1231
+ "learning_rate": 9.061189313415684e-06,
1232
+ "loss": 0.9306,
1233
+ "step": 12900
1234
+ },
1235
+ {
1236
+ "epoch": 3.7345590347601263,
1237
+ "grad_norm": 4.637047290802002,
1238
+ "learning_rate": 8.860097673082446e-06,
1239
+ "loss": 0.923,
1240
+ "step": 13000
1241
+ },
1242
+ {
1243
+ "epoch": 3.763286411950589,
1244
+ "grad_norm": 3.9064526557922363,
1245
+ "learning_rate": 8.65900603274921e-06,
1246
+ "loss": 0.9034,
1247
+ "step": 13100
1248
+ },
1249
+ {
1250
+ "epoch": 3.7920137891410515,
1251
+ "grad_norm": 4.590717792510986,
1252
+ "learning_rate": 8.457914392415972e-06,
1253
+ "loss": 0.9283,
1254
+ "step": 13200
1255
+ },
1256
+ {
1257
+ "epoch": 3.7920137891410515,
1258
+ "eval_bleu": 45.36403316661577,
1259
+ "eval_chrf": 69.79708022162536,
1260
+ "eval_loss": 0.9839698672294617,
1261
+ "eval_runtime": 96.5576,
1262
+ "eval_samples_per_second": 8.451,
1263
+ "eval_steps_per_second": 1.056,
1264
+ "step": 13200
1265
+ },
1266
+ {
1267
+ "epoch": 3.820741166331514,
1268
+ "grad_norm": 3.184018611907959,
1269
+ "learning_rate": 8.256822752082735e-06,
1270
+ "loss": 0.9431,
1271
+ "step": 13300
1272
+ },
1273
+ {
1274
+ "epoch": 3.8494685435219766,
1275
+ "grad_norm": 5.2427449226379395,
1276
+ "learning_rate": 8.055731111749497e-06,
1277
+ "loss": 0.9175,
1278
+ "step": 13400
1279
+ },
1280
+ {
1281
+ "epoch": 3.878195920712439,
1282
+ "grad_norm": 4.087272644042969,
1283
+ "learning_rate": 7.854639471416259e-06,
1284
+ "loss": 0.9204,
1285
+ "step": 13500
1286
+ },
1287
+ {
1288
+ "epoch": 3.9069232979029014,
1289
+ "grad_norm": 3.3691282272338867,
1290
+ "learning_rate": 7.653547831083021e-06,
1291
+ "loss": 0.9239,
1292
+ "step": 13600
1293
+ },
1294
+ {
1295
+ "epoch": 3.9069232979029014,
1296
+ "eval_bleu": 45.45276331869471,
1297
+ "eval_chrf": 69.95136404450032,
1298
+ "eval_loss": 0.9811603426933289,
1299
+ "eval_runtime": 96.502,
1300
+ "eval_samples_per_second": 8.456,
1301
+ "eval_steps_per_second": 1.057,
1302
+ "step": 13600
1303
+ },
1304
+ {
1305
+ "epoch": 3.935650675093364,
1306
+ "grad_norm": 3.8877460956573486,
1307
+ "learning_rate": 7.452456190749784e-06,
1308
+ "loss": 0.9371,
1309
+ "step": 13700
1310
+ },
1311
+ {
1312
+ "epoch": 3.9643780522838266,
1313
+ "grad_norm": 3.848369836807251,
1314
+ "learning_rate": 7.251364550416546e-06,
1315
+ "loss": 0.9284,
1316
+ "step": 13800
1317
+ },
1318
+ {
1319
+ "epoch": 3.993105429474289,
1320
+ "grad_norm": 3.9678354263305664,
1321
+ "learning_rate": 7.0502729100833085e-06,
1322
+ "loss": 0.9303,
1323
+ "step": 13900
1324
+ },
1325
+ {
1326
+ "epoch": 4.021832806664752,
1327
+ "grad_norm": 3.6919009685516357,
1328
+ "learning_rate": 6.849181269750071e-06,
1329
+ "loss": 0.926,
1330
+ "step": 14000
1331
+ },
1332
+ {
1333
+ "epoch": 4.021832806664752,
1334
+ "eval_bleu": 45.58281536674216,
1335
+ "eval_chrf": 69.89625294733818,
1336
+ "eval_loss": 0.9837493896484375,
1337
+ "eval_runtime": 96.3768,
1338
+ "eval_samples_per_second": 8.467,
1339
+ "eval_steps_per_second": 1.058,
1340
+ "step": 14000
1341
+ },
1342
+ {
1343
+ "epoch": 4.050560183855214,
1344
+ "grad_norm": 4.469439506530762,
1345
+ "learning_rate": 6.648089629416834e-06,
1346
+ "loss": 0.9154,
1347
+ "step": 14100
1348
+ },
1349
+ {
1350
+ "epoch": 4.0792875610456765,
1351
+ "grad_norm": 3.113417148590088,
1352
+ "learning_rate": 6.446997989083597e-06,
1353
+ "loss": 0.9142,
1354
+ "step": 14200
1355
+ },
1356
+ {
1357
+ "epoch": 4.108014938236139,
1358
+ "grad_norm": 4.672985076904297,
1359
+ "learning_rate": 6.245906348750359e-06,
1360
+ "loss": 0.9569,
1361
+ "step": 14300
1362
+ },
1363
+ {
1364
+ "epoch": 4.136742315426601,
1365
+ "grad_norm": 3.867658853530884,
1366
+ "learning_rate": 6.044814708417121e-06,
1367
+ "loss": 0.9083,
1368
+ "step": 14400
1369
+ },
1370
+ {
1371
+ "epoch": 4.136742315426601,
1372
+ "eval_bleu": 45.69719220756097,
1373
+ "eval_chrf": 70.00557701215436,
1374
+ "eval_loss": 0.9808344841003418,
1375
+ "eval_runtime": 95.9409,
1376
+ "eval_samples_per_second": 8.505,
1377
+ "eval_steps_per_second": 1.063,
1378
+ "step": 14400
1379
+ },
1380
+ {
1381
+ "epoch": 4.165469692617064,
1382
+ "grad_norm": 5.639285087585449,
1383
+ "learning_rate": 5.843723068083883e-06,
1384
+ "loss": 0.955,
1385
+ "step": 14500
1386
+ },
1387
+ {
1388
+ "epoch": 4.194197069807527,
1389
+ "grad_norm": 6.473118305206299,
1390
+ "learning_rate": 5.642631427750645e-06,
1391
+ "loss": 0.908,
1392
+ "step": 14600
1393
+ },
1394
+ {
1395
+ "epoch": 4.222924446997989,
1396
+ "grad_norm": 3.8311145305633545,
1397
+ "learning_rate": 5.441539787417408e-06,
1398
+ "loss": 0.8807,
1399
+ "step": 14700
1400
+ },
1401
+ {
1402
+ "epoch": 4.251651824188452,
1403
+ "grad_norm": 4.5824713706970215,
1404
+ "learning_rate": 5.2404481470841715e-06,
1405
+ "loss": 0.9582,
1406
+ "step": 14800
1407
+ },
1408
+ {
1409
+ "epoch": 4.251651824188452,
1410
+ "eval_bleu": 45.491128667326144,
1411
+ "eval_chrf": 69.78846486375005,
1412
+ "eval_loss": 0.9811265468597412,
1413
+ "eval_runtime": 95.7284,
1414
+ "eval_samples_per_second": 8.524,
1415
+ "eval_steps_per_second": 1.066,
1416
+ "step": 14800
1417
+ },
1418
+ {
1419
+ "epoch": 4.280379201378914,
1420
+ "grad_norm": 3.3168869018554688,
1421
+ "learning_rate": 5.039356506750934e-06,
1422
+ "loss": 0.905,
1423
+ "step": 14900
1424
+ },
1425
+ {
1426
+ "epoch": 4.309106578569376,
1427
+ "grad_norm": 4.996278285980225,
1428
+ "learning_rate": 4.838264866417696e-06,
1429
+ "loss": 0.9318,
1430
+ "step": 15000
1431
+ },
1432
+ {
1433
+ "epoch": 4.337833955759839,
1434
+ "grad_norm": 3.3141753673553467,
1435
+ "learning_rate": 4.637173226084458e-06,
1436
+ "loss": 0.9484,
1437
+ "step": 15100
1438
+ },
1439
+ {
1440
+ "epoch": 4.366561332950302,
1441
+ "grad_norm": 5.22437858581543,
1442
+ "learning_rate": 4.43608158575122e-06,
1443
+ "loss": 0.9509,
1444
+ "step": 15200
1445
+ },
1446
+ {
1447
+ "epoch": 4.366561332950302,
1448
+ "eval_bleu": 45.30090194764136,
1449
+ "eval_chrf": 69.70965638946056,
1450
+ "eval_loss": 0.9816325306892395,
1451
+ "eval_runtime": 96.1093,
1452
+ "eval_samples_per_second": 8.49,
1453
+ "eval_steps_per_second": 1.061,
1454
+ "step": 15200
1455
+ },
1456
+ {
1457
+ "epoch": 4.395288710140764,
1458
+ "grad_norm": 2.9990546703338623,
1459
+ "learning_rate": 4.234989945417983e-06,
1460
+ "loss": 0.9515,
1461
+ "step": 15300
1462
+ },
1463
+ {
1464
+ "epoch": 4.424016087331227,
1465
+ "grad_norm": 4.750415802001953,
1466
+ "learning_rate": 4.033898305084745e-06,
1467
+ "loss": 0.9212,
1468
+ "step": 15400
1469
+ },
1470
+ {
1471
+ "epoch": 4.4527434645216895,
1472
+ "grad_norm": 4.545401573181152,
1473
+ "learning_rate": 3.8328066647515075e-06,
1474
+ "loss": 0.9054,
1475
+ "step": 15500
1476
+ },
1477
+ {
1478
+ "epoch": 4.481470841712151,
1479
+ "grad_norm": 7.730736255645752,
1480
+ "learning_rate": 3.6317150244182706e-06,
1481
+ "loss": 0.9476,
1482
+ "step": 15600
1483
+ },
1484
+ {
1485
+ "epoch": 4.481470841712151,
1486
+ "eval_bleu": 45.50584441125612,
1487
+ "eval_chrf": 69.82630096157298,
1488
+ "eval_loss": 0.9792063236236572,
1489
+ "eval_runtime": 96.5086,
1490
+ "eval_samples_per_second": 8.455,
1491
+ "eval_steps_per_second": 1.057,
1492
+ "step": 15600
1493
+ },
1494
+ {
1495
+ "epoch": 4.510198218902614,
1496
+ "grad_norm": 5.245133876800537,
1497
+ "learning_rate": 3.4306233840850327e-06,
1498
+ "loss": 0.9284,
1499
+ "step": 15700
1500
+ },
1501
+ {
1502
+ "epoch": 4.538925596093077,
1503
+ "grad_norm": 8.488020896911621,
1504
+ "learning_rate": 3.229531743751795e-06,
1505
+ "loss": 0.9276,
1506
+ "step": 15800
1507
+ },
1508
+ {
1509
+ "epoch": 4.567652973283539,
1510
+ "grad_norm": 3.835139751434326,
1511
+ "learning_rate": 3.028440103418558e-06,
1512
+ "loss": 0.9217,
1513
+ "step": 15900
1514
+ },
1515
+ {
1516
+ "epoch": 4.596380350474002,
1517
+ "grad_norm": 7.393352508544922,
1518
+ "learning_rate": 2.82734846308532e-06,
1519
+ "loss": 0.9308,
1520
+ "step": 16000
1521
+ },
1522
+ {
1523
+ "epoch": 4.596380350474002,
1524
+ "eval_bleu": 45.590601518193715,
1525
+ "eval_chrf": 69.80956789483677,
1526
+ "eval_loss": 0.9778218865394592,
1527
+ "eval_runtime": 95.9136,
1528
+ "eval_samples_per_second": 8.508,
1529
+ "eval_steps_per_second": 1.063,
1530
+ "step": 16000
1531
+ },
1532
+ {
1533
+ "epoch": 4.6251077276644645,
1534
+ "grad_norm": 2.7596733570098877,
1535
+ "learning_rate": 2.6262568227520823e-06,
1536
+ "loss": 0.9257,
1537
+ "step": 16100
1538
+ },
1539
+ {
1540
+ "epoch": 4.6538351048549265,
1541
+ "grad_norm": 3.292307138442993,
1542
+ "learning_rate": 2.425165182418845e-06,
1543
+ "loss": 0.9412,
1544
+ "step": 16200
1545
+ },
1546
+ {
1547
+ "epoch": 4.682562482045389,
1548
+ "grad_norm": 4.457400798797607,
1549
+ "learning_rate": 2.2240735420856075e-06,
1550
+ "loss": 0.9047,
1551
+ "step": 16300
1552
+ },
1553
+ {
1554
+ "epoch": 4.711289859235852,
1555
+ "grad_norm": 3.834993362426758,
1556
+ "learning_rate": 2.0229819017523696e-06,
1557
+ "loss": 0.9473,
1558
+ "step": 16400
1559
+ },
1560
+ {
1561
+ "epoch": 4.711289859235852,
1562
+ "eval_bleu": 45.58715450124686,
1563
+ "eval_chrf": 69.84194121674784,
1564
+ "eval_loss": 0.9792063236236572,
1565
+ "eval_runtime": 96.0002,
1566
+ "eval_samples_per_second": 8.5,
1567
+ "eval_steps_per_second": 1.062,
1568
+ "step": 16400
1569
+ },
1570
+ {
1571
+ "epoch": 4.740017236426314,
1572
+ "grad_norm": 6.808932304382324,
1573
+ "learning_rate": 1.8218902614191322e-06,
1574
+ "loss": 0.936,
1575
+ "step": 16500
1576
+ },
1577
+ {
1578
+ "epoch": 4.768744613616777,
1579
+ "grad_norm": 4.363387584686279,
1580
+ "learning_rate": 1.6207986210858948e-06,
1581
+ "loss": 0.9441,
1582
+ "step": 16600
1583
+ },
1584
+ {
1585
+ "epoch": 4.79747199080724,
1586
+ "grad_norm": 4.4117207527160645,
1587
+ "learning_rate": 1.419706980752657e-06,
1588
+ "loss": 0.8911,
1589
+ "step": 16700
1590
+ },
1591
+ {
1592
+ "epoch": 4.8261993679977016,
1593
+ "grad_norm": 3.518254041671753,
1594
+ "learning_rate": 1.2186153404194196e-06,
1595
+ "loss": 0.8976,
1596
+ "step": 16800
1597
+ },
1598
+ {
1599
+ "epoch": 4.8261993679977016,
1600
+ "eval_bleu": 45.53459878251091,
1601
+ "eval_chrf": 69.83862737825717,
1602
+ "eval_loss": 0.9769607186317444,
1603
+ "eval_runtime": 97.0944,
1604
+ "eval_samples_per_second": 8.404,
1605
+ "eval_steps_per_second": 1.051,
1606
+ "step": 16800
1607
+ },
1608
+ {
1609
+ "epoch": 4.854926745188164,
1610
+ "grad_norm": 3.083101272583008,
1611
+ "learning_rate": 1.017523700086182e-06,
1612
+ "loss": 0.9441,
1613
+ "step": 16900
1614
+ },
1615
+ {
1616
+ "epoch": 4.883654122378627,
1617
+ "grad_norm": 3.4856061935424805,
1618
+ "learning_rate": 8.164320597529445e-07,
1619
+ "loss": 0.9502,
1620
+ "step": 17000
1621
+ },
1622
+ {
1623
+ "epoch": 4.912381499569089,
1624
+ "grad_norm": 4.190710067749023,
1625
+ "learning_rate": 6.15340419419707e-07,
1626
+ "loss": 0.8723,
1627
+ "step": 17100
1628
+ },
1629
+ {
1630
+ "epoch": 4.941108876759552,
1631
+ "grad_norm": 4.440817356109619,
1632
+ "learning_rate": 4.1424877908646937e-07,
1633
+ "loss": 0.8778,
1634
+ "step": 17200
1635
+ },
1636
+ {
1637
+ "epoch": 4.941108876759552,
1638
+ "eval_bleu": 45.58580948868277,
1639
+ "eval_chrf": 69.90139019717897,
1640
+ "eval_loss": 0.9762945175170898,
1641
+ "eval_runtime": 96.1964,
1642
+ "eval_samples_per_second": 8.483,
1643
+ "eval_steps_per_second": 1.06,
1644
+ "step": 17200
1645
+ },
1646
+ {
1647
+ "epoch": 4.969836253950015,
1648
+ "grad_norm": 4.020137786865234,
1649
+ "learning_rate": 2.131571387532318e-07,
1650
+ "loss": 0.9407,
1651
+ "step": 17300
1652
+ },
1653
+ {
1654
+ "epoch": 4.998563631140477,
1655
+ "grad_norm": 24.651016235351562,
1656
+ "learning_rate": 1.2065498419994253e-08,
1657
+ "loss": 0.9964,
1658
+ "step": 17400
1659
  }
1660
  ],
1661
  "logging_steps": 100,
1662
+ "max_steps": 17405,
1663
  "num_input_tokens_seen": 0,
1664
+ "num_train_epochs": 5,
1665
  "save_steps": 400,
1666
  "stateful_callbacks": {
1667
  "TrainerControl": {
 
1670
  "should_evaluate": false,
1671
  "should_log": false,
1672
  "should_save": true,
1673
+ "should_training_stop": true
1674
  },
1675
  "attributes": {}
1676
  }
1677
  },
1678
+ "total_flos": 4.76634010484736e+16,
1679
  "train_batch_size": 8,
1680
  "trial_name": null,
1681
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c569ee8bfa824e15f7ea34dff282d6cfba522dc5377663ba7e9fb590f959a53d
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1490ccf8fe424c3a3ae1d43d264b9201ab09ecd3fe396ea763d20c92d88354da
3
  size 5905