ParamDev commited on
Commit
7b17410
·
verified ·
1 Parent(s): 1b025e2

Training in progress, step 3058, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2547a1834088d21e98ec3e4e5e154a1c893c790db93141d6803b6c6b4e0cb520
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:316b5b66c011ab655d2de0d0f9b4e3fb9737869533eff5fc1cb619905299a286
3
  size 167832240
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bad9c49e5757093c89864d172d61b76590dc6367e9b74336d43c7a9789677fb
3
  size 335929123
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51fea0149fe130b7385c5c2c92755ef0f42f6621fac6fad2ea562cd36563f8ab
3
  size 335929123
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8c95752f3156e70465d4d9b9d27e57d0dcf79ef565f88d74e68bd1ca64d1fe8
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aac6559199f848eb2e15456c1e5cee636970b1ba1477e47f5e59da91c5e0c169
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e44a8ea4f4d8bdd42319f9353c22a1e55c2bcf26c3f183ce2efea6ec7054b75c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3571d042b38772fa078cbf10ae0e9c85dcf4749dee484ca4e2ce1b0fc9a1dab4
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 1529,
3
  "best_metric": 0.15474164485931396,
4
  "best_model_checkpoint": "./results_ner_lora_entity_aware/checkpoint-1529",
5
- "epoch": 0.4968318440292445,
6
  "eval_steps": 1529,
7
- "global_step": 1529,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1080,6 +1080,1085 @@
1080
  "eval_samples_per_second": 3.817,
1081
  "eval_steps_per_second": 3.817,
1082
  "step": 1529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1083
  }
1084
  ],
1085
  "logging_steps": 10,
@@ -1094,7 +2173,7 @@
1094
  "early_stopping_threshold": 0.0
1095
  },
1096
  "attributes": {
1097
- "early_stopping_patience_counter": 0
1098
  }
1099
  },
1100
  "TrainerControl": {
@@ -1108,7 +2187,7 @@
1108
  "attributes": {}
1109
  }
1110
  },
1111
- "total_flos": 1.188574066945622e+18,
1112
  "train_batch_size": 1,
1113
  "trial_name": null,
1114
  "trial_params": null
 
2
  "best_global_step": 1529,
3
  "best_metric": 0.15474164485931396,
4
  "best_model_checkpoint": "./results_ner_lora_entity_aware/checkpoint-1529",
5
+ "epoch": 0.993663688058489,
6
  "eval_steps": 1529,
7
+ "global_step": 3058,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1080
  "eval_samples_per_second": 3.817,
1081
  "eval_steps_per_second": 3.817,
1082
  "step": 1529
1083
+ },
1084
+ {
1085
+ "epoch": 0.49715678310316813,
1086
+ "grad_norm": 0.4904119372367859,
1087
+ "learning_rate": 4.481201106059251e-05,
1088
+ "loss": 0.0326,
1089
+ "step": 1530
1090
+ },
1091
+ {
1092
+ "epoch": 0.5004061738424046,
1093
+ "grad_norm": 0.41244634985923767,
1094
+ "learning_rate": 4.472981513766538e-05,
1095
+ "loss": 0.0296,
1096
+ "step": 1540
1097
+ },
1098
+ {
1099
+ "epoch": 0.503655564581641,
1100
+ "grad_norm": 0.4068795442581177,
1101
+ "learning_rate": 4.464704982847008e-05,
1102
+ "loss": 0.0345,
1103
+ "step": 1550
1104
+ },
1105
+ {
1106
+ "epoch": 0.5069049553208773,
1107
+ "grad_norm": 0.3623734712600708,
1108
+ "learning_rate": 4.456371752154549e-05,
1109
+ "loss": 0.0309,
1110
+ "step": 1560
1111
+ },
1112
+ {
1113
+ "epoch": 0.5101543460601138,
1114
+ "grad_norm": 0.3473750054836273,
1115
+ "learning_rate": 4.447982062179358e-05,
1116
+ "loss": 0.0313,
1117
+ "step": 1570
1118
+ },
1119
+ {
1120
+ "epoch": 0.5134037367993501,
1121
+ "grad_norm": 0.4023807644844055,
1122
+ "learning_rate": 4.439536155041e-05,
1123
+ "loss": 0.0328,
1124
+ "step": 1580
1125
+ },
1126
+ {
1127
+ "epoch": 0.5166531275385865,
1128
+ "grad_norm": 0.3513451814651489,
1129
+ "learning_rate": 4.4310342744814214e-05,
1130
+ "loss": 0.0343,
1131
+ "step": 1590
1132
+ },
1133
+ {
1134
+ "epoch": 0.5199025182778229,
1135
+ "grad_norm": 0.37276962399482727,
1136
+ "learning_rate": 4.4224766658579166e-05,
1137
+ "loss": 0.031,
1138
+ "step": 1600
1139
+ },
1140
+ {
1141
+ "epoch": 0.5231519090170593,
1142
+ "grad_norm": 0.456547349691391,
1143
+ "learning_rate": 4.413863576136044e-05,
1144
+ "loss": 0.0317,
1145
+ "step": 1610
1146
+ },
1147
+ {
1148
+ "epoch": 0.5264012997562957,
1149
+ "grad_norm": 0.3823520541191101,
1150
+ "learning_rate": 4.4051952538825034e-05,
1151
+ "loss": 0.0302,
1152
+ "step": 1620
1153
+ },
1154
+ {
1155
+ "epoch": 0.529650690495532,
1156
+ "grad_norm": 0.42634570598602295,
1157
+ "learning_rate": 4.3964719492579584e-05,
1158
+ "loss": 0.0288,
1159
+ "step": 1630
1160
+ },
1161
+ {
1162
+ "epoch": 0.5329000812347685,
1163
+ "grad_norm": 0.33916255831718445,
1164
+ "learning_rate": 4.387693914009819e-05,
1165
+ "loss": 0.0298,
1166
+ "step": 1640
1167
+ },
1168
+ {
1169
+ "epoch": 0.5361494719740049,
1170
+ "grad_norm": 0.441773921251297,
1171
+ "learning_rate": 4.3788614014649775e-05,
1172
+ "loss": 0.033,
1173
+ "step": 1650
1174
+ },
1175
+ {
1176
+ "epoch": 0.5393988627132412,
1177
+ "grad_norm": 0.35982316732406616,
1178
+ "learning_rate": 4.3699746665224945e-05,
1179
+ "loss": 0.0267,
1180
+ "step": 1660
1181
+ },
1182
+ {
1183
+ "epoch": 0.5426482534524777,
1184
+ "grad_norm": 0.3963621258735657,
1185
+ "learning_rate": 4.3610339656462445e-05,
1186
+ "loss": 0.0308,
1187
+ "step": 1670
1188
+ },
1189
+ {
1190
+ "epoch": 0.545897644191714,
1191
+ "grad_norm": 0.4419509172439575,
1192
+ "learning_rate": 4.352039556857516e-05,
1193
+ "loss": 0.0325,
1194
+ "step": 1680
1195
+ },
1196
+ {
1197
+ "epoch": 0.5491470349309504,
1198
+ "grad_norm": 0.4380287826061249,
1199
+ "learning_rate": 4.3429916997275626e-05,
1200
+ "loss": 0.0297,
1201
+ "step": 1690
1202
+ },
1203
+ {
1204
+ "epoch": 0.5523964256701869,
1205
+ "grad_norm": 0.27853381633758545,
1206
+ "learning_rate": 4.333890655370113e-05,
1207
+ "loss": 0.0314,
1208
+ "step": 1700
1209
+ },
1210
+ {
1211
+ "epoch": 0.5556458164094232,
1212
+ "grad_norm": 0.3319164514541626,
1213
+ "learning_rate": 4.324736686433837e-05,
1214
+ "loss": 0.0294,
1215
+ "step": 1710
1216
+ },
1217
+ {
1218
+ "epoch": 0.5588952071486596,
1219
+ "grad_norm": 0.5052310824394226,
1220
+ "learning_rate": 4.315530057094762e-05,
1221
+ "loss": 0.0314,
1222
+ "step": 1720
1223
+ },
1224
+ {
1225
+ "epoch": 0.5621445978878961,
1226
+ "grad_norm": 0.2669266164302826,
1227
+ "learning_rate": 4.306271033048655e-05,
1228
+ "loss": 0.0305,
1229
+ "step": 1730
1230
+ },
1231
+ {
1232
+ "epoch": 0.5653939886271324,
1233
+ "grad_norm": 0.3192387819290161,
1234
+ "learning_rate": 4.2969598815033476e-05,
1235
+ "loss": 0.0274,
1236
+ "step": 1740
1237
+ },
1238
+ {
1239
+ "epoch": 0.5686433793663688,
1240
+ "grad_norm": 0.5369754433631897,
1241
+ "learning_rate": 4.2875968711710286e-05,
1242
+ "loss": 0.032,
1243
+ "step": 1750
1244
+ },
1245
+ {
1246
+ "epoch": 0.5718927701056052,
1247
+ "grad_norm": 0.2641260623931885,
1248
+ "learning_rate": 4.2781822722604916e-05,
1249
+ "loss": 0.0272,
1250
+ "step": 1760
1251
+ },
1252
+ {
1253
+ "epoch": 0.5751421608448416,
1254
+ "grad_norm": 0.4835808575153351,
1255
+ "learning_rate": 4.268716356469331e-05,
1256
+ "loss": 0.0286,
1257
+ "step": 1770
1258
+ },
1259
+ {
1260
+ "epoch": 0.578391551584078,
1261
+ "grad_norm": 0.27514582872390747,
1262
+ "learning_rate": 4.259199396976107e-05,
1263
+ "loss": 0.0269,
1264
+ "step": 1780
1265
+ },
1266
+ {
1267
+ "epoch": 0.5816409423233144,
1268
+ "grad_norm": 0.3719632625579834,
1269
+ "learning_rate": 4.2496316684324585e-05,
1270
+ "loss": 0.029,
1271
+ "step": 1790
1272
+ },
1273
+ {
1274
+ "epoch": 0.5848903330625508,
1275
+ "grad_norm": 0.36428073048591614,
1276
+ "learning_rate": 4.2400134469551746e-05,
1277
+ "loss": 0.0304,
1278
+ "step": 1800
1279
+ },
1280
+ {
1281
+ "epoch": 0.5881397238017871,
1282
+ "grad_norm": 0.6899104714393616,
1283
+ "learning_rate": 4.230345010118233e-05,
1284
+ "loss": 0.0296,
1285
+ "step": 1810
1286
+ },
1287
+ {
1288
+ "epoch": 0.5913891145410236,
1289
+ "grad_norm": 0.3395729660987854,
1290
+ "learning_rate": 4.220626636944783e-05,
1291
+ "loss": 0.025,
1292
+ "step": 1820
1293
+ },
1294
+ {
1295
+ "epoch": 0.59463850528026,
1296
+ "grad_norm": 0.38969168066978455,
1297
+ "learning_rate": 4.2108586078990966e-05,
1298
+ "loss": 0.0248,
1299
+ "step": 1830
1300
+ },
1301
+ {
1302
+ "epoch": 0.5978878960194963,
1303
+ "grad_norm": 0.3905599117279053,
1304
+ "learning_rate": 4.2010412048784733e-05,
1305
+ "loss": 0.0257,
1306
+ "step": 1840
1307
+ },
1308
+ {
1309
+ "epoch": 0.6011372867587328,
1310
+ "grad_norm": 0.3760197162628174,
1311
+ "learning_rate": 4.191174711205105e-05,
1312
+ "loss": 0.0278,
1313
+ "step": 1850
1314
+ },
1315
+ {
1316
+ "epoch": 0.6043866774979691,
1317
+ "grad_norm": 0.4263547956943512,
1318
+ "learning_rate": 4.181259411617898e-05,
1319
+ "loss": 0.0248,
1320
+ "step": 1860
1321
+ },
1322
+ {
1323
+ "epoch": 0.6076360682372055,
1324
+ "grad_norm": 0.46628740429878235,
1325
+ "learning_rate": 4.1712955922642614e-05,
1326
+ "loss": 0.0274,
1327
+ "step": 1870
1328
+ },
1329
+ {
1330
+ "epoch": 0.6108854589764419,
1331
+ "grad_norm": 0.46346044540405273,
1332
+ "learning_rate": 4.161283540691841e-05,
1333
+ "loss": 0.0248,
1334
+ "step": 1880
1335
+ },
1336
+ {
1337
+ "epoch": 0.6141348497156783,
1338
+ "grad_norm": 0.37473928928375244,
1339
+ "learning_rate": 4.151223545840225e-05,
1340
+ "loss": 0.0272,
1341
+ "step": 1890
1342
+ },
1343
+ {
1344
+ "epoch": 0.6173842404549147,
1345
+ "grad_norm": 0.4056950807571411,
1346
+ "learning_rate": 4.141115898032607e-05,
1347
+ "loss": 0.024,
1348
+ "step": 1900
1349
+ },
1350
+ {
1351
+ "epoch": 0.620633631194151,
1352
+ "grad_norm": 0.4770098626613617,
1353
+ "learning_rate": 4.130960888967405e-05,
1354
+ "loss": 0.0237,
1355
+ "step": 1910
1356
+ },
1357
+ {
1358
+ "epoch": 0.6238830219333875,
1359
+ "grad_norm": 0.41782036423683167,
1360
+ "learning_rate": 4.1207588117098445e-05,
1361
+ "loss": 0.0272,
1362
+ "step": 1920
1363
+ },
1364
+ {
1365
+ "epoch": 0.6271324126726239,
1366
+ "grad_norm": 0.4040960669517517,
1367
+ "learning_rate": 4.1105099606835e-05,
1368
+ "loss": 0.0256,
1369
+ "step": 1930
1370
+ },
1371
+ {
1372
+ "epoch": 0.6303818034118602,
1373
+ "grad_norm": 0.3390992283821106,
1374
+ "learning_rate": 4.1002146316617986e-05,
1375
+ "loss": 0.0276,
1376
+ "step": 1940
1377
+ },
1378
+ {
1379
+ "epoch": 0.6336311941510967,
1380
+ "grad_norm": 0.4464505910873413,
1381
+ "learning_rate": 4.0898731217594836e-05,
1382
+ "loss": 0.0265,
1383
+ "step": 1950
1384
+ },
1385
+ {
1386
+ "epoch": 0.6368805848903331,
1387
+ "grad_norm": 0.46650540828704834,
1388
+ "learning_rate": 4.0794857294240415e-05,
1389
+ "loss": 0.0229,
1390
+ "step": 1960
1391
+ },
1392
+ {
1393
+ "epoch": 0.6401299756295694,
1394
+ "grad_norm": 0.39290061593055725,
1395
+ "learning_rate": 4.0690527544270886e-05,
1396
+ "loss": 0.0207,
1397
+ "step": 1970
1398
+ },
1399
+ {
1400
+ "epoch": 0.6433793663688059,
1401
+ "grad_norm": 0.38553354144096375,
1402
+ "learning_rate": 4.0585744978557174e-05,
1403
+ "loss": 0.0267,
1404
+ "step": 1980
1405
+ },
1406
+ {
1407
+ "epoch": 0.6466287571080422,
1408
+ "grad_norm": 0.456086128950119,
1409
+ "learning_rate": 4.048051262103811e-05,
1410
+ "loss": 0.0264,
1411
+ "step": 1990
1412
+ },
1413
+ {
1414
+ "epoch": 0.6498781478472786,
1415
+ "grad_norm": 0.5561078786849976,
1416
+ "learning_rate": 4.0374833508633156e-05,
1417
+ "loss": 0.0218,
1418
+ "step": 2000
1419
+ },
1420
+ {
1421
+ "epoch": 0.6531275385865151,
1422
+ "grad_norm": 0.3495825231075287,
1423
+ "learning_rate": 4.0268710691154724e-05,
1424
+ "loss": 0.0226,
1425
+ "step": 2010
1426
+ },
1427
+ {
1428
+ "epoch": 0.6563769293257514,
1429
+ "grad_norm": 0.4553760588169098,
1430
+ "learning_rate": 4.0162147231220216e-05,
1431
+ "loss": 0.024,
1432
+ "step": 2020
1433
+ },
1434
+ {
1435
+ "epoch": 0.6596263200649878,
1436
+ "grad_norm": 0.40624141693115234,
1437
+ "learning_rate": 4.0055146204163605e-05,
1438
+ "loss": 0.022,
1439
+ "step": 2030
1440
+ },
1441
+ {
1442
+ "epoch": 0.6628757108042242,
1443
+ "grad_norm": 0.37093663215637207,
1444
+ "learning_rate": 3.994771069794668e-05,
1445
+ "loss": 0.0241,
1446
+ "step": 2040
1447
+ },
1448
+ {
1449
+ "epoch": 0.6661251015434606,
1450
+ "grad_norm": 0.5362465977668762,
1451
+ "learning_rate": 3.9839843813069984e-05,
1452
+ "loss": 0.0246,
1453
+ "step": 2050
1454
+ },
1455
+ {
1456
+ "epoch": 0.669374492282697,
1457
+ "grad_norm": 0.4092622995376587,
1458
+ "learning_rate": 3.9731548662483234e-05,
1459
+ "loss": 0.0241,
1460
+ "step": 2060
1461
+ },
1462
+ {
1463
+ "epoch": 0.6726238830219334,
1464
+ "grad_norm": 0.41743412613868713,
1465
+ "learning_rate": 3.962282837149558e-05,
1466
+ "loss": 0.0232,
1467
+ "step": 2070
1468
+ },
1469
+ {
1470
+ "epoch": 0.6758732737611698,
1471
+ "grad_norm": 0.5507500171661377,
1472
+ "learning_rate": 3.951368607768537e-05,
1473
+ "loss": 0.0223,
1474
+ "step": 2080
1475
+ },
1476
+ {
1477
+ "epoch": 0.6791226645004061,
1478
+ "grad_norm": 0.45379722118377686,
1479
+ "learning_rate": 3.9404124930809625e-05,
1480
+ "loss": 0.0242,
1481
+ "step": 2090
1482
+ },
1483
+ {
1484
+ "epoch": 0.6823720552396426,
1485
+ "grad_norm": 0.36823663115501404,
1486
+ "learning_rate": 3.929414809271308e-05,
1487
+ "loss": 0.0265,
1488
+ "step": 2100
1489
+ },
1490
+ {
1491
+ "epoch": 0.685621445978879,
1492
+ "grad_norm": 0.3751804530620575,
1493
+ "learning_rate": 3.918375873723701e-05,
1494
+ "loss": 0.0245,
1495
+ "step": 2110
1496
+ },
1497
+ {
1498
+ "epoch": 0.6888708367181153,
1499
+ "grad_norm": 0.4553548991680145,
1500
+ "learning_rate": 3.907296005012758e-05,
1501
+ "loss": 0.0257,
1502
+ "step": 2120
1503
+ },
1504
+ {
1505
+ "epoch": 0.6921202274573518,
1506
+ "grad_norm": 0.3335023820400238,
1507
+ "learning_rate": 3.896175522894395e-05,
1508
+ "loss": 0.0226,
1509
+ "step": 2130
1510
+ },
1511
+ {
1512
+ "epoch": 0.6953696181965882,
1513
+ "grad_norm": 0.4702470600605011,
1514
+ "learning_rate": 3.8850147482965973e-05,
1515
+ "loss": 0.0218,
1516
+ "step": 2140
1517
+ },
1518
+ {
1519
+ "epoch": 0.6986190089358245,
1520
+ "grad_norm": 0.39953991770744324,
1521
+ "learning_rate": 3.873814003310158e-05,
1522
+ "loss": 0.0194,
1523
+ "step": 2150
1524
+ },
1525
+ {
1526
+ "epoch": 0.7018683996750609,
1527
+ "grad_norm": 0.5067981481552124,
1528
+ "learning_rate": 3.862573611179381e-05,
1529
+ "loss": 0.022,
1530
+ "step": 2160
1531
+ },
1532
+ {
1533
+ "epoch": 0.7051177904142973,
1534
+ "grad_norm": 0.30548062920570374,
1535
+ "learning_rate": 3.851293896292756e-05,
1536
+ "loss": 0.0171,
1537
+ "step": 2170
1538
+ },
1539
+ {
1540
+ "epoch": 0.7083671811535337,
1541
+ "grad_norm": 0.39521142840385437,
1542
+ "learning_rate": 3.839975184173596e-05,
1543
+ "loss": 0.0192,
1544
+ "step": 2180
1545
+ },
1546
+ {
1547
+ "epoch": 0.7116165718927701,
1548
+ "grad_norm": 0.36969834566116333,
1549
+ "learning_rate": 3.8286178014706395e-05,
1550
+ "loss": 0.0245,
1551
+ "step": 2190
1552
+ },
1553
+ {
1554
+ "epoch": 0.7148659626320065,
1555
+ "grad_norm": 0.4855635166168213,
1556
+ "learning_rate": 3.8172220759486287e-05,
1557
+ "loss": 0.0229,
1558
+ "step": 2200
1559
+ },
1560
+ {
1561
+ "epoch": 0.7181153533712429,
1562
+ "grad_norm": 0.45929041504859924,
1563
+ "learning_rate": 3.8057883364788475e-05,
1564
+ "loss": 0.0186,
1565
+ "step": 2210
1566
+ },
1567
+ {
1568
+ "epoch": 0.7213647441104792,
1569
+ "grad_norm": 0.2941083610057831,
1570
+ "learning_rate": 3.7943169130296295e-05,
1571
+ "loss": 0.0188,
1572
+ "step": 2220
1573
+ },
1574
+ {
1575
+ "epoch": 0.7246141348497157,
1576
+ "grad_norm": 0.3690025806427002,
1577
+ "learning_rate": 3.782808136656839e-05,
1578
+ "loss": 0.0188,
1579
+ "step": 2230
1580
+ },
1581
+ {
1582
+ "epoch": 0.7278635255889521,
1583
+ "grad_norm": 0.38714373111724854,
1584
+ "learning_rate": 3.771262339494314e-05,
1585
+ "loss": 0.0191,
1586
+ "step": 2240
1587
+ },
1588
+ {
1589
+ "epoch": 0.7311129163281884,
1590
+ "grad_norm": 0.40680810809135437,
1591
+ "learning_rate": 3.759679854744282e-05,
1592
+ "loss": 0.0197,
1593
+ "step": 2250
1594
+ },
1595
+ {
1596
+ "epoch": 0.7343623070674249,
1597
+ "grad_norm": 0.41902831196784973,
1598
+ "learning_rate": 3.748061016667745e-05,
1599
+ "loss": 0.0205,
1600
+ "step": 2260
1601
+ },
1602
+ {
1603
+ "epoch": 0.7376116978066612,
1604
+ "grad_norm": 0.4369294047355652,
1605
+ "learning_rate": 3.736406160574833e-05,
1606
+ "loss": 0.019,
1607
+ "step": 2270
1608
+ },
1609
+ {
1610
+ "epoch": 0.7408610885458976,
1611
+ "grad_norm": 0.3856930732727051,
1612
+ "learning_rate": 3.724715622815122e-05,
1613
+ "loss": 0.022,
1614
+ "step": 2280
1615
+ },
1616
+ {
1617
+ "epoch": 0.7441104792851341,
1618
+ "grad_norm": 0.34679755568504333,
1619
+ "learning_rate": 3.712989740767938e-05,
1620
+ "loss": 0.0164,
1621
+ "step": 2290
1622
+ },
1623
+ {
1624
+ "epoch": 0.7473598700243704,
1625
+ "grad_norm": 0.3927323818206787,
1626
+ "learning_rate": 3.7012288528326086e-05,
1627
+ "loss": 0.0181,
1628
+ "step": 2300
1629
+ },
1630
+ {
1631
+ "epoch": 0.7506092607636068,
1632
+ "grad_norm": 0.4021192491054535,
1633
+ "learning_rate": 3.689433298418706e-05,
1634
+ "loss": 0.0159,
1635
+ "step": 2310
1636
+ },
1637
+ {
1638
+ "epoch": 0.7538586515028433,
1639
+ "grad_norm": 0.48003751039505005,
1640
+ "learning_rate": 3.6776034179362474e-05,
1641
+ "loss": 0.0177,
1642
+ "step": 2320
1643
+ },
1644
+ {
1645
+ "epoch": 0.7571080422420796,
1646
+ "grad_norm": 0.3487580716609955,
1647
+ "learning_rate": 3.66573955278587e-05,
1648
+ "loss": 0.0162,
1649
+ "step": 2330
1650
+ },
1651
+ {
1652
+ "epoch": 0.760357432981316,
1653
+ "grad_norm": 0.47422289848327637,
1654
+ "learning_rate": 3.653842045348985e-05,
1655
+ "loss": 0.018,
1656
+ "step": 2340
1657
+ },
1658
+ {
1659
+ "epoch": 0.7636068237205524,
1660
+ "grad_norm": 0.38853368163108826,
1661
+ "learning_rate": 3.64191123897789e-05,
1662
+ "loss": 0.0229,
1663
+ "step": 2350
1664
+ },
1665
+ {
1666
+ "epoch": 0.7668562144597888,
1667
+ "grad_norm": 0.36860230565071106,
1668
+ "learning_rate": 3.62994747798586e-05,
1669
+ "loss": 0.018,
1670
+ "step": 2360
1671
+ },
1672
+ {
1673
+ "epoch": 0.7701056051990252,
1674
+ "grad_norm": 0.4562481939792633,
1675
+ "learning_rate": 3.617951107637219e-05,
1676
+ "loss": 0.0191,
1677
+ "step": 2370
1678
+ },
1679
+ {
1680
+ "epoch": 0.7733549959382616,
1681
+ "grad_norm": 0.708402156829834,
1682
+ "learning_rate": 3.605922474137366e-05,
1683
+ "loss": 0.019,
1684
+ "step": 2380
1685
+ },
1686
+ {
1687
+ "epoch": 0.776604386677498,
1688
+ "grad_norm": 0.48525258898735046,
1689
+ "learning_rate": 3.5938619246227884e-05,
1690
+ "loss": 0.0217,
1691
+ "step": 2390
1692
+ },
1693
+ {
1694
+ "epoch": 0.7798537774167343,
1695
+ "grad_norm": 0.3320712447166443,
1696
+ "learning_rate": 3.581769807151044e-05,
1697
+ "loss": 0.0195,
1698
+ "step": 2400
1699
+ },
1700
+ {
1701
+ "epoch": 0.7831031681559708,
1702
+ "grad_norm": 0.36696651577949524,
1703
+ "learning_rate": 3.56964647069072e-05,
1704
+ "loss": 0.0192,
1705
+ "step": 2410
1706
+ },
1707
+ {
1708
+ "epoch": 0.7863525588952072,
1709
+ "grad_norm": 1.0005451440811157,
1710
+ "learning_rate": 3.55749226511135e-05,
1711
+ "loss": 0.0196,
1712
+ "step": 2420
1713
+ },
1714
+ {
1715
+ "epoch": 0.7896019496344435,
1716
+ "grad_norm": 0.45593878626823425,
1717
+ "learning_rate": 3.54530754117333e-05,
1718
+ "loss": 0.0225,
1719
+ "step": 2430
1720
+ },
1721
+ {
1722
+ "epoch": 0.7928513403736799,
1723
+ "grad_norm": 0.33774876594543457,
1724
+ "learning_rate": 3.533092650517793e-05,
1725
+ "loss": 0.0199,
1726
+ "step": 2440
1727
+ },
1728
+ {
1729
+ "epoch": 0.7961007311129163,
1730
+ "grad_norm": 0.483853816986084,
1731
+ "learning_rate": 3.5208479456564524e-05,
1732
+ "loss": 0.0206,
1733
+ "step": 2450
1734
+ },
1735
+ {
1736
+ "epoch": 0.7993501218521527,
1737
+ "grad_norm": 0.36340585350990295,
1738
+ "learning_rate": 3.508573779961441e-05,
1739
+ "loss": 0.0172,
1740
+ "step": 2460
1741
+ },
1742
+ {
1743
+ "epoch": 0.8025995125913891,
1744
+ "grad_norm": 0.515352725982666,
1745
+ "learning_rate": 3.4962705076551026e-05,
1746
+ "loss": 0.0151,
1747
+ "step": 2470
1748
+ },
1749
+ {
1750
+ "epoch": 0.8058489033306255,
1751
+ "grad_norm": 0.3187580406665802,
1752
+ "learning_rate": 3.483938483799778e-05,
1753
+ "loss": 0.0148,
1754
+ "step": 2480
1755
+ },
1756
+ {
1757
+ "epoch": 0.8090982940698619,
1758
+ "grad_norm": 0.4531770646572113,
1759
+ "learning_rate": 3.47157806428755e-05,
1760
+ "loss": 0.0158,
1761
+ "step": 2490
1762
+ },
1763
+ {
1764
+ "epoch": 0.8123476848090982,
1765
+ "grad_norm": 0.6547293066978455,
1766
+ "learning_rate": 3.45918960582998e-05,
1767
+ "loss": 0.0144,
1768
+ "step": 2500
1769
+ },
1770
+ {
1771
+ "epoch": 0.8155970755483347,
1772
+ "grad_norm": 0.24430936574935913,
1773
+ "learning_rate": 3.446773465947809e-05,
1774
+ "loss": 0.0148,
1775
+ "step": 2510
1776
+ },
1777
+ {
1778
+ "epoch": 0.8188464662875711,
1779
+ "grad_norm": 0.425484299659729,
1780
+ "learning_rate": 3.4343300029606404e-05,
1781
+ "loss": 0.0192,
1782
+ "step": 2520
1783
+ },
1784
+ {
1785
+ "epoch": 0.8220958570268074,
1786
+ "grad_norm": 0.43435633182525635,
1787
+ "learning_rate": 3.4218595759766013e-05,
1788
+ "loss": 0.0192,
1789
+ "step": 2530
1790
+ },
1791
+ {
1792
+ "epoch": 0.8253452477660439,
1793
+ "grad_norm": 0.499104380607605,
1794
+ "learning_rate": 3.409362544881977e-05,
1795
+ "loss": 0.0172,
1796
+ "step": 2540
1797
+ },
1798
+ {
1799
+ "epoch": 0.8285946385052803,
1800
+ "grad_norm": 0.5690107941627502,
1801
+ "learning_rate": 3.3968392703308264e-05,
1802
+ "loss": 0.017,
1803
+ "step": 2550
1804
+ },
1805
+ {
1806
+ "epoch": 0.8318440292445166,
1807
+ "grad_norm": 0.4789363145828247,
1808
+ "learning_rate": 3.3842901137345725e-05,
1809
+ "loss": 0.0151,
1810
+ "step": 2560
1811
+ },
1812
+ {
1813
+ "epoch": 0.8350934199837531,
1814
+ "grad_norm": 0.34053072333335876,
1815
+ "learning_rate": 3.3717154372515716e-05,
1816
+ "loss": 0.0155,
1817
+ "step": 2570
1818
+ },
1819
+ {
1820
+ "epoch": 0.8383428107229894,
1821
+ "grad_norm": 0.44401687383651733,
1822
+ "learning_rate": 3.3591156037766655e-05,
1823
+ "loss": 0.0138,
1824
+ "step": 2580
1825
+ },
1826
+ {
1827
+ "epoch": 0.8415922014622258,
1828
+ "grad_norm": 0.41682168841362,
1829
+ "learning_rate": 3.346490976930704e-05,
1830
+ "loss": 0.014,
1831
+ "step": 2590
1832
+ },
1833
+ {
1834
+ "epoch": 0.8448415922014623,
1835
+ "grad_norm": 0.6371095776557922,
1836
+ "learning_rate": 3.333841921050053e-05,
1837
+ "loss": 0.0176,
1838
+ "step": 2600
1839
+ },
1840
+ {
1841
+ "epoch": 0.8480909829406986,
1842
+ "grad_norm": 0.22434203326702118,
1843
+ "learning_rate": 3.3211688011760835e-05,
1844
+ "loss": 0.0123,
1845
+ "step": 2610
1846
+ },
1847
+ {
1848
+ "epoch": 0.851340373679935,
1849
+ "grad_norm": 0.5385378003120422,
1850
+ "learning_rate": 3.30847198304463e-05,
1851
+ "loss": 0.016,
1852
+ "step": 2620
1853
+ },
1854
+ {
1855
+ "epoch": 0.8545897644191714,
1856
+ "grad_norm": 0.35033077001571655,
1857
+ "learning_rate": 3.2957518330754406e-05,
1858
+ "loss": 0.0149,
1859
+ "step": 2630
1860
+ },
1861
+ {
1862
+ "epoch": 0.8578391551584078,
1863
+ "grad_norm": 0.3793995976448059,
1864
+ "learning_rate": 3.2830087183616015e-05,
1865
+ "loss": 0.0153,
1866
+ "step": 2640
1867
+ },
1868
+ {
1869
+ "epoch": 0.8610885458976442,
1870
+ "grad_norm": 0.6376614570617676,
1871
+ "learning_rate": 3.270243006658942e-05,
1872
+ "loss": 0.0154,
1873
+ "step": 2650
1874
+ },
1875
+ {
1876
+ "epoch": 0.8643379366368806,
1877
+ "grad_norm": 0.3490144908428192,
1878
+ "learning_rate": 3.257455066375423e-05,
1879
+ "loss": 0.0154,
1880
+ "step": 2660
1881
+ },
1882
+ {
1883
+ "epoch": 0.867587327376117,
1884
+ "grad_norm": 0.40602752566337585,
1885
+ "learning_rate": 3.244645266560501e-05,
1886
+ "loss": 0.0136,
1887
+ "step": 2670
1888
+ },
1889
+ {
1890
+ "epoch": 0.8708367181153533,
1891
+ "grad_norm": 0.37300461530685425,
1892
+ "learning_rate": 3.2318139768944856e-05,
1893
+ "loss": 0.0127,
1894
+ "step": 2680
1895
+ },
1896
+ {
1897
+ "epoch": 0.8740861088545898,
1898
+ "grad_norm": 0.3366054594516754,
1899
+ "learning_rate": 3.218961567677861e-05,
1900
+ "loss": 0.0142,
1901
+ "step": 2690
1902
+ },
1903
+ {
1904
+ "epoch": 0.8773354995938262,
1905
+ "grad_norm": 0.4088799059391022,
1906
+ "learning_rate": 3.206088409820606e-05,
1907
+ "loss": 0.0143,
1908
+ "step": 2700
1909
+ },
1910
+ {
1911
+ "epoch": 0.8805848903330625,
1912
+ "grad_norm": 0.3606589734554291,
1913
+ "learning_rate": 3.19319487483149e-05,
1914
+ "loss": 0.0125,
1915
+ "step": 2710
1916
+ },
1917
+ {
1918
+ "epoch": 0.8838342810722989,
1919
+ "grad_norm": 0.48760858178138733,
1920
+ "learning_rate": 3.180281334807348e-05,
1921
+ "loss": 0.0121,
1922
+ "step": 2720
1923
+ },
1924
+ {
1925
+ "epoch": 0.8870836718115354,
1926
+ "grad_norm": 0.4494096040725708,
1927
+ "learning_rate": 3.1673481624223426e-05,
1928
+ "loss": 0.0123,
1929
+ "step": 2730
1930
+ },
1931
+ {
1932
+ "epoch": 0.8903330625507717,
1933
+ "grad_norm": 0.4649289846420288,
1934
+ "learning_rate": 3.154395730917213e-05,
1935
+ "loss": 0.0135,
1936
+ "step": 2740
1937
+ },
1938
+ {
1939
+ "epoch": 0.8935824532900081,
1940
+ "grad_norm": 0.3580702841281891,
1941
+ "learning_rate": 3.141424414088499e-05,
1942
+ "loss": 0.014,
1943
+ "step": 2750
1944
+ },
1945
+ {
1946
+ "epoch": 0.8968318440292445,
1947
+ "grad_norm": 0.2921467423439026,
1948
+ "learning_rate": 3.128434586277757e-05,
1949
+ "loss": 0.0146,
1950
+ "step": 2760
1951
+ },
1952
+ {
1953
+ "epoch": 0.9000812347684809,
1954
+ "grad_norm": 0.5378055572509766,
1955
+ "learning_rate": 3.115426622360752e-05,
1956
+ "loss": 0.0131,
1957
+ "step": 2770
1958
+ },
1959
+ {
1960
+ "epoch": 0.9033306255077173,
1961
+ "grad_norm": 0.34166646003723145,
1962
+ "learning_rate": 3.102400897736645e-05,
1963
+ "loss": 0.0123,
1964
+ "step": 2780
1965
+ },
1966
+ {
1967
+ "epoch": 0.9065800162469537,
1968
+ "grad_norm": 0.43183135986328125,
1969
+ "learning_rate": 3.0893577883171556e-05,
1970
+ "loss": 0.0151,
1971
+ "step": 2790
1972
+ },
1973
+ {
1974
+ "epoch": 0.9098294069861901,
1975
+ "grad_norm": 0.6324542760848999,
1976
+ "learning_rate": 3.076297670515713e-05,
1977
+ "loss": 0.0128,
1978
+ "step": 2800
1979
+ },
1980
+ {
1981
+ "epoch": 0.9130787977254264,
1982
+ "grad_norm": 0.43282851576805115,
1983
+ "learning_rate": 3.063220921236598e-05,
1984
+ "loss": 0.0129,
1985
+ "step": 2810
1986
+ },
1987
+ {
1988
+ "epoch": 0.9163281884646629,
1989
+ "grad_norm": 0.2942393124103546,
1990
+ "learning_rate": 3.0501279178640575e-05,
1991
+ "loss": 0.0131,
1992
+ "step": 2820
1993
+ },
1994
+ {
1995
+ "epoch": 0.9195775792038993,
1996
+ "grad_norm": 0.32284924387931824,
1997
+ "learning_rate": 3.0370190382514213e-05,
1998
+ "loss": 0.0103,
1999
+ "step": 2830
2000
+ },
2001
+ {
2002
+ "epoch": 0.9228269699431356,
2003
+ "grad_norm": 0.38951289653778076,
2004
+ "learning_rate": 3.0238946607101936e-05,
2005
+ "loss": 0.0105,
2006
+ "step": 2840
2007
+ },
2008
+ {
2009
+ "epoch": 0.9260763606823721,
2010
+ "grad_norm": 0.407099187374115,
2011
+ "learning_rate": 3.0107551639991365e-05,
2012
+ "loss": 0.0109,
2013
+ "step": 2850
2014
+ },
2015
+ {
2016
+ "epoch": 0.9293257514216084,
2017
+ "grad_norm": 0.5025432705879211,
2018
+ "learning_rate": 2.997600927313338e-05,
2019
+ "loss": 0.0115,
2020
+ "step": 2860
2021
+ },
2022
+ {
2023
+ "epoch": 0.9325751421608448,
2024
+ "grad_norm": 0.18864522874355316,
2025
+ "learning_rate": 2.98443233027327e-05,
2026
+ "loss": 0.011,
2027
+ "step": 2870
2028
+ },
2029
+ {
2030
+ "epoch": 0.9358245329000813,
2031
+ "grad_norm": 0.3602016866207123,
2032
+ "learning_rate": 2.971249752913834e-05,
2033
+ "loss": 0.012,
2034
+ "step": 2880
2035
+ },
2036
+ {
2037
+ "epoch": 0.9390739236393176,
2038
+ "grad_norm": 0.38901951909065247,
2039
+ "learning_rate": 2.958053575673389e-05,
2040
+ "loss": 0.0113,
2041
+ "step": 2890
2042
+ },
2043
+ {
2044
+ "epoch": 0.942323314378554,
2045
+ "grad_norm": 0.3340912163257599,
2046
+ "learning_rate": 2.944844179382778e-05,
2047
+ "loss": 0.0102,
2048
+ "step": 2900
2049
+ },
2050
+ {
2051
+ "epoch": 0.9455727051177905,
2052
+ "grad_norm": 0.32333633303642273,
2053
+ "learning_rate": 2.931621945254334e-05,
2054
+ "loss": 0.0117,
2055
+ "step": 2910
2056
+ },
2057
+ {
2058
+ "epoch": 0.9488220958570268,
2059
+ "grad_norm": 0.45609724521636963,
2060
+ "learning_rate": 2.918387254870879e-05,
2061
+ "loss": 0.0114,
2062
+ "step": 2920
2063
+ },
2064
+ {
2065
+ "epoch": 0.9520714865962632,
2066
+ "grad_norm": 0.3951948285102844,
2067
+ "learning_rate": 2.905140490174713e-05,
2068
+ "loss": 0.0099,
2069
+ "step": 2930
2070
+ },
2071
+ {
2072
+ "epoch": 0.9553208773354996,
2073
+ "grad_norm": 0.4054865539073944,
2074
+ "learning_rate": 2.8918820334565905e-05,
2075
+ "loss": 0.0118,
2076
+ "step": 2940
2077
+ },
2078
+ {
2079
+ "epoch": 0.958570268074736,
2080
+ "grad_norm": 0.40964823961257935,
2081
+ "learning_rate": 2.8786122673446893e-05,
2082
+ "loss": 0.0113,
2083
+ "step": 2950
2084
+ },
2085
+ {
2086
+ "epoch": 0.9618196588139724,
2087
+ "grad_norm": 0.37629735469818115,
2088
+ "learning_rate": 2.865331574793564e-05,
2089
+ "loss": 0.0112,
2090
+ "step": 2960
2091
+ },
2092
+ {
2093
+ "epoch": 0.9650690495532088,
2094
+ "grad_norm": 0.3705214262008667,
2095
+ "learning_rate": 2.8520403390731e-05,
2096
+ "loss": 0.0117,
2097
+ "step": 2970
2098
+ },
2099
+ {
2100
+ "epoch": 0.9683184402924452,
2101
+ "grad_norm": 0.3900469243526459,
2102
+ "learning_rate": 2.8387389437574495e-05,
2103
+ "loss": 0.0108,
2104
+ "step": 2980
2105
+ },
2106
+ {
2107
+ "epoch": 0.9715678310316815,
2108
+ "grad_norm": 0.2905224561691284,
2109
+ "learning_rate": 2.8254277727139616e-05,
2110
+ "loss": 0.0112,
2111
+ "step": 2990
2112
+ },
2113
+ {
2114
+ "epoch": 0.974817221770918,
2115
+ "grad_norm": 0.6466222405433655,
2116
+ "learning_rate": 2.812107210092105e-05,
2117
+ "loss": 0.0124,
2118
+ "step": 3000
2119
+ },
2120
+ {
2121
+ "epoch": 0.9780666125101544,
2122
+ "grad_norm": 0.5542816519737244,
2123
+ "learning_rate": 2.798777640312381e-05,
2124
+ "loss": 0.0112,
2125
+ "step": 3010
2126
+ },
2127
+ {
2128
+ "epoch": 0.9813160032493907,
2129
+ "grad_norm": 0.3827550411224365,
2130
+ "learning_rate": 2.7854394480552327e-05,
2131
+ "loss": 0.0112,
2132
+ "step": 3020
2133
+ },
2134
+ {
2135
+ "epoch": 0.9845653939886271,
2136
+ "grad_norm": 0.32786279916763306,
2137
+ "learning_rate": 2.7720930182499367e-05,
2138
+ "loss": 0.0115,
2139
+ "step": 3030
2140
+ },
2141
+ {
2142
+ "epoch": 0.9878147847278635,
2143
+ "grad_norm": 0.537501335144043,
2144
+ "learning_rate": 2.7587387360635032e-05,
2145
+ "loss": 0.0113,
2146
+ "step": 3040
2147
+ },
2148
+ {
2149
+ "epoch": 0.9910641754670999,
2150
+ "grad_norm": 0.47765976190567017,
2151
+ "learning_rate": 2.7453769868895518e-05,
2152
+ "loss": 0.0141,
2153
+ "step": 3050
2154
+ },
2155
+ {
2156
+ "epoch": 0.993663688058489,
2157
+ "eval_loss": 0.1697877049446106,
2158
+ "eval_runtime": 733.8424,
2159
+ "eval_samples_per_second": 3.816,
2160
+ "eval_steps_per_second": 3.816,
2161
+ "step": 3058
2162
  }
2163
  ],
2164
  "logging_steps": 10,
 
2173
  "early_stopping_threshold": 0.0
2174
  },
2175
  "attributes": {
2176
+ "early_stopping_patience_counter": 1
2177
  }
2178
  },
2179
  "TrainerControl": {
 
2187
  "attributes": {}
2188
  }
2189
  },
2190
+ "total_flos": 2.385847921489871e+18,
2191
  "train_batch_size": 1,
2192
  "trial_name": null,
2193
  "trial_params": null