ErrorAI commited on
Commit
41176bd
·
verified ·
1 Parent(s): 9879714

Training in progress, step 356, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59a559adb82ba9e02f825b2265c3ee9c8f41dfeeeabcafdb3e28bc5d9402787c
3
  size 125918320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e481995cf6d237020e6aab4a1e7404637d3ca4faf64b643a18771e95eaa340eb
3
  size 125918320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:894c133336c6eeb60b9946b1f23cd32081a17ffd17960c9430aff222a2132fb1
3
- size 64683604
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c5f87aedc82c3a0d97bdd0013b3b34f2f23143184a093d55e9a14b33b4ccd6b
3
+ size 64684244
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41135de3aae46e55a15430a5e662d81ab71a89cb89c7fe5caa86792391b74f3b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f940370db0768ac9a318b750a653da4911f1db917a73143506c211e803ef55e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd3fa22a75ad90fa3285176fb3ab9ab3749a31cede267438251b8820c7148cb7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17d88a80230fc1c998d77e01d3856d0912d9f20dd4f54f9e81578aede7b8a3ec
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.25061598028863075,
5
  "eval_steps": 500,
6
- "global_step": 178,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1253,6 +1253,1252 @@
1253
  "learning_rate": 8.590166950903118e-05,
1254
  "loss": 0.6124,
1255
  "step": 178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1256
  }
1257
  ],
1258
  "logging_steps": 1,
@@ -1272,7 +2518,7 @@
1272
  "attributes": {}
1273
  }
1274
  },
1275
- "total_flos": 1.860383410350981e+17,
1276
  "train_batch_size": 4,
1277
  "trial_name": null,
1278
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5012319605772615,
5
  "eval_steps": 500,
6
+ "global_step": 356,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1253
  "learning_rate": 8.590166950903118e-05,
1254
  "loss": 0.6124,
1255
  "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 0.2520239352340725,
1259
+ "grad_norm": 14.029097557067871,
1260
+ "learning_rate": 8.574645793874449e-05,
1261
+ "loss": 0.8471,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 0.25343189017951423,
1266
+ "grad_norm": 34.442237854003906,
1267
+ "learning_rate": 8.559053854876494e-05,
1268
+ "loss": 1.8921,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 0.254839845124956,
1273
+ "grad_norm": 17.151165008544922,
1274
+ "learning_rate": 8.543391442647013e-05,
1275
+ "loss": 1.1393,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 0.25624780007039777,
1280
+ "grad_norm": 16.576353073120117,
1281
+ "learning_rate": 8.527658867319221e-05,
1282
+ "loss": 0.4902,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 0.2576557550158395,
1287
+ "grad_norm": 31.439992904663086,
1288
+ "learning_rate": 8.511856440415635e-05,
1289
+ "loss": 1.7562,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 0.25906370996128125,
1294
+ "grad_norm": 102.67780303955078,
1295
+ "learning_rate": 8.495984474841918e-05,
1296
+ "loss": 4.32,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 0.26047166490672297,
1301
+ "grad_norm": 45.334354400634766,
1302
+ "learning_rate": 8.480043284880666e-05,
1303
+ "loss": 2.0922,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 0.26187961985216474,
1308
+ "grad_norm": 37.032569885253906,
1309
+ "learning_rate": 8.464033186185203e-05,
1310
+ "loss": 1.9404,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 0.26328757479760645,
1315
+ "grad_norm": 33.529842376708984,
1316
+ "learning_rate": 8.44795449577332e-05,
1317
+ "loss": 1.5912,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 0.2646955297430482,
1322
+ "grad_norm": 53.00758361816406,
1323
+ "learning_rate": 8.431807532021e-05,
1324
+ "loss": 3.0904,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 0.26610348468849,
1329
+ "grad_norm": 31.37575340270996,
1330
+ "learning_rate": 8.415592614656119e-05,
1331
+ "loss": 1.6186,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 0.2675114396339317,
1336
+ "grad_norm": 29.855907440185547,
1337
+ "learning_rate": 8.399310064752099e-05,
1338
+ "loss": 1.8013,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 0.26891939457937347,
1343
+ "grad_norm": 43.18759536743164,
1344
+ "learning_rate": 8.382960204721575e-05,
1345
+ "loss": 1.6222,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 0.2703273495248152,
1350
+ "grad_norm": 31.552494049072266,
1351
+ "learning_rate": 8.366543358309989e-05,
1352
+ "loss": 1.1728,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 0.27173530447025696,
1357
+ "grad_norm": 14.046395301818848,
1358
+ "learning_rate": 8.35005985058919e-05,
1359
+ "loss": 0.6825,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 0.2731432594156987,
1364
+ "grad_norm": 23.78717613220215,
1365
+ "learning_rate": 8.333510007950995e-05,
1366
+ "loss": 1.9934,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 0.27455121436114044,
1371
+ "grad_norm": 19.5039005279541,
1372
+ "learning_rate": 8.316894158100727e-05,
1373
+ "loss": 1.3594,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 0.2759591693065822,
1378
+ "grad_norm": 14.477341651916504,
1379
+ "learning_rate": 8.300212630050726e-05,
1380
+ "loss": 0.7189,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 0.2773671242520239,
1385
+ "grad_norm": 17.81009864807129,
1386
+ "learning_rate": 8.28346575411383e-05,
1387
+ "loss": 1.1535,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 0.2787750791974657,
1392
+ "grad_norm": 17.172264099121094,
1393
+ "learning_rate": 8.266653861896843e-05,
1394
+ "loss": 1.169,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 0.2801830341429074,
1399
+ "grad_norm": 57.817726135253906,
1400
+ "learning_rate": 8.24977728629396e-05,
1401
+ "loss": 2.9405,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 0.2815909890883492,
1406
+ "grad_norm": 177.61752319335938,
1407
+ "learning_rate": 8.232836361480183e-05,
1408
+ "loss": 4.7211,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 0.28299894403379094,
1413
+ "grad_norm": 27.973918914794922,
1414
+ "learning_rate": 8.215831422904694e-05,
1415
+ "loss": 1.0981,
1416
+ "step": 201
1417
+ },
1418
+ {
1419
+ "epoch": 0.28440689897923266,
1420
+ "grad_norm": 29.390439987182617,
1421
+ "learning_rate": 8.198762807284224e-05,
1422
+ "loss": 1.7834,
1423
+ "step": 202
1424
+ },
1425
+ {
1426
+ "epoch": 0.28581485392467443,
1427
+ "grad_norm": 35.740665435791016,
1428
+ "learning_rate": 8.181630852596379e-05,
1429
+ "loss": 2.2235,
1430
+ "step": 203
1431
+ },
1432
+ {
1433
+ "epoch": 0.28722280887011614,
1434
+ "grad_norm": 26.734773635864258,
1435
+ "learning_rate": 8.164435898072947e-05,
1436
+ "loss": 1.9502,
1437
+ "step": 204
1438
+ },
1439
+ {
1440
+ "epoch": 0.2886307638155579,
1441
+ "grad_norm": 18.85955047607422,
1442
+ "learning_rate": 8.147178284193184e-05,
1443
+ "loss": 1.2279,
1444
+ "step": 205
1445
+ },
1446
+ {
1447
+ "epoch": 0.2900387187609996,
1448
+ "grad_norm": 17.08832550048828,
1449
+ "learning_rate": 8.129858352677076e-05,
1450
+ "loss": 1.1731,
1451
+ "step": 206
1452
+ },
1453
+ {
1454
+ "epoch": 0.2914466737064414,
1455
+ "grad_norm": 44.938209533691406,
1456
+ "learning_rate": 8.112476446478561e-05,
1457
+ "loss": 1.5057,
1458
+ "step": 207
1459
+ },
1460
+ {
1461
+ "epoch": 0.29285462865188316,
1462
+ "grad_norm": 43.36068344116211,
1463
+ "learning_rate": 8.095032909778748e-05,
1464
+ "loss": 1.0289,
1465
+ "step": 208
1466
+ },
1467
+ {
1468
+ "epoch": 0.2942625835973249,
1469
+ "grad_norm": 18.839191436767578,
1470
+ "learning_rate": 8.077528087979095e-05,
1471
+ "loss": 1.1196,
1472
+ "step": 209
1473
+ },
1474
+ {
1475
+ "epoch": 0.29567053854276665,
1476
+ "grad_norm": 20.551124572753906,
1477
+ "learning_rate": 8.05996232769458e-05,
1478
+ "loss": 1.1649,
1479
+ "step": 210
1480
+ },
1481
+ {
1482
+ "epoch": 0.29707849348820836,
1483
+ "grad_norm": 25.39565658569336,
1484
+ "learning_rate": 8.042335976746822e-05,
1485
+ "loss": 1.3211,
1486
+ "step": 211
1487
+ },
1488
+ {
1489
+ "epoch": 0.29848644843365013,
1490
+ "grad_norm": 14.85946273803711,
1491
+ "learning_rate": 8.024649384157212e-05,
1492
+ "loss": 0.6612,
1493
+ "step": 212
1494
+ },
1495
+ {
1496
+ "epoch": 0.29989440337909185,
1497
+ "grad_norm": 19.78711700439453,
1498
+ "learning_rate": 8.006902900139988e-05,
1499
+ "loss": 1.3747,
1500
+ "step": 213
1501
+ },
1502
+ {
1503
+ "epoch": 0.3013023583245336,
1504
+ "grad_norm": 11.568204879760742,
1505
+ "learning_rate": 7.989096876095303e-05,
1506
+ "loss": 1.1045,
1507
+ "step": 214
1508
+ },
1509
+ {
1510
+ "epoch": 0.3027103132699754,
1511
+ "grad_norm": 9.248348236083984,
1512
+ "learning_rate": 7.971231664602273e-05,
1513
+ "loss": 0.7642,
1514
+ "step": 215
1515
+ },
1516
+ {
1517
+ "epoch": 0.3041182682154171,
1518
+ "grad_norm": 5.907835960388184,
1519
+ "learning_rate": 7.953307619411987e-05,
1520
+ "loss": 0.3467,
1521
+ "step": 216
1522
+ },
1523
+ {
1524
+ "epoch": 0.30552622316085887,
1525
+ "grad_norm": 7.299631118774414,
1526
+ "learning_rate": 7.935325095440511e-05,
1527
+ "loss": 0.3262,
1528
+ "step": 217
1529
+ },
1530
+ {
1531
+ "epoch": 0.3069341781063006,
1532
+ "grad_norm": 11.987982749938965,
1533
+ "learning_rate": 7.917284448761854e-05,
1534
+ "loss": 0.6308,
1535
+ "step": 218
1536
+ },
1537
+ {
1538
+ "epoch": 0.30834213305174235,
1539
+ "grad_norm": 10.527399063110352,
1540
+ "learning_rate": 7.899186036600918e-05,
1541
+ "loss": 1.0346,
1542
+ "step": 219
1543
+ },
1544
+ {
1545
+ "epoch": 0.30975008799718406,
1546
+ "grad_norm": 45.371761322021484,
1547
+ "learning_rate": 7.88103021732643e-05,
1548
+ "loss": 2.1641,
1549
+ "step": 220
1550
+ },
1551
+ {
1552
+ "epoch": 0.31115804294262583,
1553
+ "grad_norm": 19.207111358642578,
1554
+ "learning_rate": 7.862817350443839e-05,
1555
+ "loss": 1.1476,
1556
+ "step": 221
1557
+ },
1558
+ {
1559
+ "epoch": 0.3125659978880676,
1560
+ "grad_norm": 59.24346160888672,
1561
+ "learning_rate": 7.8445477965882e-05,
1562
+ "loss": 1.891,
1563
+ "step": 222
1564
+ },
1565
+ {
1566
+ "epoch": 0.3139739528335093,
1567
+ "grad_norm": 8.811090469360352,
1568
+ "learning_rate": 7.826221917517034e-05,
1569
+ "loss": 0.5215,
1570
+ "step": 223
1571
+ },
1572
+ {
1573
+ "epoch": 0.3153819077789511,
1574
+ "grad_norm": 33.62078094482422,
1575
+ "learning_rate": 7.807840076103163e-05,
1576
+ "loss": 1.5825,
1577
+ "step": 224
1578
+ },
1579
+ {
1580
+ "epoch": 0.3167898627243928,
1581
+ "grad_norm": 31.49905014038086,
1582
+ "learning_rate": 7.789402636327525e-05,
1583
+ "loss": 1.084,
1584
+ "step": 225
1585
+ },
1586
+ {
1587
+ "epoch": 0.31819781766983457,
1588
+ "grad_norm": 11.637606620788574,
1589
+ "learning_rate": 7.770909963271972e-05,
1590
+ "loss": 0.7103,
1591
+ "step": 226
1592
+ },
1593
+ {
1594
+ "epoch": 0.3196057726152763,
1595
+ "grad_norm": 21.97195816040039,
1596
+ "learning_rate": 7.752362423112032e-05,
1597
+ "loss": 1.3204,
1598
+ "step": 227
1599
+ },
1600
+ {
1601
+ "epoch": 0.32101372756071805,
1602
+ "grad_norm": 18.698328018188477,
1603
+ "learning_rate": 7.733760383109665e-05,
1604
+ "loss": 0.9943,
1605
+ "step": 228
1606
+ },
1607
+ {
1608
+ "epoch": 0.3224216825061598,
1609
+ "grad_norm": 19.418983459472656,
1610
+ "learning_rate": 7.715104211605987e-05,
1611
+ "loss": 0.8113,
1612
+ "step": 229
1613
+ },
1614
+ {
1615
+ "epoch": 0.32382963745160154,
1616
+ "grad_norm": 16.439781188964844,
1617
+ "learning_rate": 7.696394278013979e-05,
1618
+ "loss": 0.6862,
1619
+ "step": 230
1620
+ },
1621
+ {
1622
+ "epoch": 0.3252375923970433,
1623
+ "grad_norm": 9.741683959960938,
1624
+ "learning_rate": 7.677630952811172e-05,
1625
+ "loss": 0.5196,
1626
+ "step": 231
1627
+ },
1628
+ {
1629
+ "epoch": 0.326645547342485,
1630
+ "grad_norm": 8.357406616210938,
1631
+ "learning_rate": 7.65881460753231e-05,
1632
+ "loss": 0.5758,
1633
+ "step": 232
1634
+ },
1635
+ {
1636
+ "epoch": 0.3280535022879268,
1637
+ "grad_norm": 33.68247604370117,
1638
+ "learning_rate": 7.63994561476199e-05,
1639
+ "loss": 1.8643,
1640
+ "step": 233
1641
+ },
1642
+ {
1643
+ "epoch": 0.32946145723336856,
1644
+ "grad_norm": 41.963356018066406,
1645
+ "learning_rate": 7.621024348127295e-05,
1646
+ "loss": 2.0034,
1647
+ "step": 234
1648
+ },
1649
+ {
1650
+ "epoch": 0.3308694121788103,
1651
+ "grad_norm": 20.343597412109375,
1652
+ "learning_rate": 7.602051182290382e-05,
1653
+ "loss": 1.3085,
1654
+ "step": 235
1655
+ },
1656
+ {
1657
+ "epoch": 0.33227736712425204,
1658
+ "grad_norm": 19.10648536682129,
1659
+ "learning_rate": 7.583026492941075e-05,
1660
+ "loss": 1.1332,
1661
+ "step": 236
1662
+ },
1663
+ {
1664
+ "epoch": 0.33368532206969376,
1665
+ "grad_norm": 35.396820068359375,
1666
+ "learning_rate": 7.563950656789416e-05,
1667
+ "loss": 1.2122,
1668
+ "step": 237
1669
+ },
1670
+ {
1671
+ "epoch": 0.3350932770151355,
1672
+ "grad_norm": 27.122194290161133,
1673
+ "learning_rate": 7.544824051558217e-05,
1674
+ "loss": 1.5366,
1675
+ "step": 238
1676
+ },
1677
+ {
1678
+ "epoch": 0.33650123196057724,
1679
+ "grad_norm": 16.57398223876953,
1680
+ "learning_rate": 7.525647055975567e-05,
1681
+ "loss": 1.3327,
1682
+ "step": 239
1683
+ },
1684
+ {
1685
+ "epoch": 0.337909186906019,
1686
+ "grad_norm": 28.07727813720703,
1687
+ "learning_rate": 7.506420049767347e-05,
1688
+ "loss": 1.3062,
1689
+ "step": 240
1690
+ },
1691
+ {
1692
+ "epoch": 0.3393171418514608,
1693
+ "grad_norm": 24.03339385986328,
1694
+ "learning_rate": 7.4871434136497e-05,
1695
+ "loss": 1.6172,
1696
+ "step": 241
1697
+ },
1698
+ {
1699
+ "epoch": 0.3407250967969025,
1700
+ "grad_norm": 12.594011306762695,
1701
+ "learning_rate": 7.467817529321498e-05,
1702
+ "loss": 1.0874,
1703
+ "step": 242
1704
+ },
1705
+ {
1706
+ "epoch": 0.34213305174234426,
1707
+ "grad_norm": 12.581107139587402,
1708
+ "learning_rate": 7.448442779456781e-05,
1709
+ "loss": 0.7604,
1710
+ "step": 243
1711
+ },
1712
+ {
1713
+ "epoch": 0.343541006687786,
1714
+ "grad_norm": 46.649383544921875,
1715
+ "learning_rate": 7.429019547697183e-05,
1716
+ "loss": 1.3192,
1717
+ "step": 244
1718
+ },
1719
+ {
1720
+ "epoch": 0.34494896163322775,
1721
+ "grad_norm": 16.450733184814453,
1722
+ "learning_rate": 7.409548218644332e-05,
1723
+ "loss": 1.1468,
1724
+ "step": 245
1725
+ },
1726
+ {
1727
+ "epoch": 0.34635691657866946,
1728
+ "grad_norm": 16.57231903076172,
1729
+ "learning_rate": 7.390029177852237e-05,
1730
+ "loss": 1.0065,
1731
+ "step": 246
1732
+ },
1733
+ {
1734
+ "epoch": 0.34776487152411123,
1735
+ "grad_norm": 27.110193252563477,
1736
+ "learning_rate": 7.37046281181965e-05,
1737
+ "loss": 0.9427,
1738
+ "step": 247
1739
+ },
1740
+ {
1741
+ "epoch": 0.349172826469553,
1742
+ "grad_norm": 17.80980110168457,
1743
+ "learning_rate": 7.35084950798242e-05,
1744
+ "loss": 1.184,
1745
+ "step": 248
1746
+ },
1747
+ {
1748
+ "epoch": 0.3505807814149947,
1749
+ "grad_norm": 63.84309387207031,
1750
+ "learning_rate": 7.331189654705815e-05,
1751
+ "loss": 1.1323,
1752
+ "step": 249
1753
+ },
1754
+ {
1755
+ "epoch": 0.3519887363604365,
1756
+ "grad_norm": 104.79566955566406,
1757
+ "learning_rate": 7.311483641276833e-05,
1758
+ "loss": 3.3279,
1759
+ "step": 250
1760
+ },
1761
+ {
1762
+ "epoch": 0.3533966913058782,
1763
+ "grad_norm": 20.52196502685547,
1764
+ "learning_rate": 7.291731857896492e-05,
1765
+ "loss": 0.7327,
1766
+ "step": 251
1767
+ },
1768
+ {
1769
+ "epoch": 0.35480464625131997,
1770
+ "grad_norm": 10.260120391845703,
1771
+ "learning_rate": 7.271934695672112e-05,
1772
+ "loss": 0.9386,
1773
+ "step": 252
1774
+ },
1775
+ {
1776
+ "epoch": 0.3562126011967617,
1777
+ "grad_norm": 16.224411010742188,
1778
+ "learning_rate": 7.252092546609558e-05,
1779
+ "loss": 1.2396,
1780
+ "step": 253
1781
+ },
1782
+ {
1783
+ "epoch": 0.35762055614220345,
1784
+ "grad_norm": 39.729496002197266,
1785
+ "learning_rate": 7.23220580360549e-05,
1786
+ "loss": 1.3439,
1787
+ "step": 254
1788
+ },
1789
+ {
1790
+ "epoch": 0.3590285110876452,
1791
+ "grad_norm": 10.264772415161133,
1792
+ "learning_rate": 7.212274860439576e-05,
1793
+ "loss": 0.7011,
1794
+ "step": 255
1795
+ },
1796
+ {
1797
+ "epoch": 0.36043646603308693,
1798
+ "grad_norm": 18.108182907104492,
1799
+ "learning_rate": 7.192300111766696e-05,
1800
+ "loss": 0.8619,
1801
+ "step": 256
1802
+ },
1803
+ {
1804
+ "epoch": 0.3618444209785287,
1805
+ "grad_norm": 61.82547378540039,
1806
+ "learning_rate": 7.172281953109128e-05,
1807
+ "loss": 1.6629,
1808
+ "step": 257
1809
+ },
1810
+ {
1811
+ "epoch": 0.3632523759239704,
1812
+ "grad_norm": 29.251495361328125,
1813
+ "learning_rate": 7.152220780848713e-05,
1814
+ "loss": 0.8569,
1815
+ "step": 258
1816
+ },
1817
+ {
1818
+ "epoch": 0.3646603308694122,
1819
+ "grad_norm": 15.582707405090332,
1820
+ "learning_rate": 7.132116992219015e-05,
1821
+ "loss": 1.1466,
1822
+ "step": 259
1823
+ },
1824
+ {
1825
+ "epoch": 0.3660682858148539,
1826
+ "grad_norm": 26.372283935546875,
1827
+ "learning_rate": 7.111970985297446e-05,
1828
+ "loss": 1.9285,
1829
+ "step": 260
1830
+ },
1831
+ {
1832
+ "epoch": 0.36747624076029567,
1833
+ "grad_norm": 27.972610473632812,
1834
+ "learning_rate": 7.09178315899739e-05,
1835
+ "loss": 1.5448,
1836
+ "step": 261
1837
+ },
1838
+ {
1839
+ "epoch": 0.36888419570573744,
1840
+ "grad_norm": 16.148784637451172,
1841
+ "learning_rate": 7.071553913060296e-05,
1842
+ "loss": 1.3117,
1843
+ "step": 262
1844
+ },
1845
+ {
1846
+ "epoch": 0.37029215065117915,
1847
+ "grad_norm": 15.186225891113281,
1848
+ "learning_rate": 7.051283648047776e-05,
1849
+ "loss": 1.1989,
1850
+ "step": 263
1851
+ },
1852
+ {
1853
+ "epoch": 0.3717001055966209,
1854
+ "grad_norm": 11.998665809631348,
1855
+ "learning_rate": 7.030972765333655e-05,
1856
+ "loss": 0.8488,
1857
+ "step": 264
1858
+ },
1859
+ {
1860
+ "epoch": 0.37310806054206264,
1861
+ "grad_norm": 22.099727630615234,
1862
+ "learning_rate": 7.010621667096041e-05,
1863
+ "loss": 0.5446,
1864
+ "step": 265
1865
+ },
1866
+ {
1867
+ "epoch": 0.3745160154875044,
1868
+ "grad_norm": 22.740089416503906,
1869
+ "learning_rate": 6.990230756309355e-05,
1870
+ "loss": 0.7579,
1871
+ "step": 266
1872
+ },
1873
+ {
1874
+ "epoch": 0.3759239704329461,
1875
+ "grad_norm": 21.371496200561523,
1876
+ "learning_rate": 6.969800436736347e-05,
1877
+ "loss": 0.8681,
1878
+ "step": 267
1879
+ },
1880
+ {
1881
+ "epoch": 0.3773319253783879,
1882
+ "grad_norm": 5.607769966125488,
1883
+ "learning_rate": 6.949331112920106e-05,
1884
+ "loss": 0.3161,
1885
+ "step": 268
1886
+ },
1887
+ {
1888
+ "epoch": 0.37873988032382966,
1889
+ "grad_norm": 23.731332778930664,
1890
+ "learning_rate": 6.928823190176051e-05,
1891
+ "loss": 1.8792,
1892
+ "step": 269
1893
+ },
1894
+ {
1895
+ "epoch": 0.3801478352692714,
1896
+ "grad_norm": 23.237396240234375,
1897
+ "learning_rate": 6.908277074583895e-05,
1898
+ "loss": 1.7937,
1899
+ "step": 270
1900
+ },
1901
+ {
1902
+ "epoch": 0.38155579021471314,
1903
+ "grad_norm": 15.7110595703125,
1904
+ "learning_rate": 6.887693172979623e-05,
1905
+ "loss": 1.28,
1906
+ "step": 271
1907
+ },
1908
+ {
1909
+ "epoch": 0.38296374516015486,
1910
+ "grad_norm": 10.618301391601562,
1911
+ "learning_rate": 6.867071892947417e-05,
1912
+ "loss": 0.7942,
1913
+ "step": 272
1914
+ },
1915
+ {
1916
+ "epoch": 0.3843717001055966,
1917
+ "grad_norm": 4.260011196136475,
1918
+ "learning_rate": 6.846413642811598e-05,
1919
+ "loss": 0.6144,
1920
+ "step": 273
1921
+ },
1922
+ {
1923
+ "epoch": 0.38577965505103834,
1924
+ "grad_norm": 16.570066452026367,
1925
+ "learning_rate": 6.825718831628532e-05,
1926
+ "loss": 1.0566,
1927
+ "step": 274
1928
+ },
1929
+ {
1930
+ "epoch": 0.3871876099964801,
1931
+ "grad_norm": 11.564096450805664,
1932
+ "learning_rate": 6.80498786917854e-05,
1933
+ "loss": 0.7064,
1934
+ "step": 275
1935
+ },
1936
+ {
1937
+ "epoch": 0.3885955649419219,
1938
+ "grad_norm": 7.003159523010254,
1939
+ "learning_rate": 6.784221165957771e-05,
1940
+ "loss": 0.5354,
1941
+ "step": 276
1942
+ },
1943
+ {
1944
+ "epoch": 0.3900035198873636,
1945
+ "grad_norm": 14.146154403686523,
1946
+ "learning_rate": 6.763419133170089e-05,
1947
+ "loss": 0.6516,
1948
+ "step": 277
1949
+ },
1950
+ {
1951
+ "epoch": 0.39141147483280536,
1952
+ "grad_norm": 5.380551338195801,
1953
+ "learning_rate": 6.742582182718915e-05,
1954
+ "loss": 0.4163,
1955
+ "step": 278
1956
+ },
1957
+ {
1958
+ "epoch": 0.3928194297782471,
1959
+ "grad_norm": 4.118032455444336,
1960
+ "learning_rate": 6.721710727199087e-05,
1961
+ "loss": 0.4664,
1962
+ "step": 279
1963
+ },
1964
+ {
1965
+ "epoch": 0.39422738472368885,
1966
+ "grad_norm": 10.134897232055664,
1967
+ "learning_rate": 6.700805179888675e-05,
1968
+ "loss": 0.7125,
1969
+ "step": 280
1970
+ },
1971
+ {
1972
+ "epoch": 0.3956353396691306,
1973
+ "grad_norm": 23.42674446105957,
1974
+ "learning_rate": 6.679865954740808e-05,
1975
+ "loss": 1.5089,
1976
+ "step": 281
1977
+ },
1978
+ {
1979
+ "epoch": 0.39704329461457233,
1980
+ "grad_norm": 11.591614723205566,
1981
+ "learning_rate": 6.658893466375474e-05,
1982
+ "loss": 0.7875,
1983
+ "step": 282
1984
+ },
1985
+ {
1986
+ "epoch": 0.3984512495600141,
1987
+ "grad_norm": 14.444954872131348,
1988
+ "learning_rate": 6.637888130071304e-05,
1989
+ "loss": 0.9125,
1990
+ "step": 283
1991
+ },
1992
+ {
1993
+ "epoch": 0.3998592045054558,
1994
+ "grad_norm": 14.189472198486328,
1995
+ "learning_rate": 6.616850361757364e-05,
1996
+ "loss": 1.0319,
1997
+ "step": 284
1998
+ },
1999
+ {
2000
+ "epoch": 0.4012671594508976,
2001
+ "grad_norm": 36.17814636230469,
2002
+ "learning_rate": 6.595780578004901e-05,
2003
+ "loss": 1.2356,
2004
+ "step": 285
2005
+ },
2006
+ {
2007
+ "epoch": 0.4026751143963393,
2008
+ "grad_norm": 45.84112548828125,
2009
+ "learning_rate": 6.574679196019112e-05,
2010
+ "loss": 1.6351,
2011
+ "step": 286
2012
+ },
2013
+ {
2014
+ "epoch": 0.40408306934178106,
2015
+ "grad_norm": 29.742900848388672,
2016
+ "learning_rate": 6.553546633630865e-05,
2017
+ "loss": 0.9819,
2018
+ "step": 287
2019
+ },
2020
+ {
2021
+ "epoch": 0.40549102428722283,
2022
+ "grad_norm": 24.932228088378906,
2023
+ "learning_rate": 6.532383309288443e-05,
2024
+ "loss": 0.9133,
2025
+ "step": 288
2026
+ },
2027
+ {
2028
+ "epoch": 0.40689897923266455,
2029
+ "grad_norm": 42.26826477050781,
2030
+ "learning_rate": 6.511189642049244e-05,
2031
+ "loss": 1.1545,
2032
+ "step": 289
2033
+ },
2034
+ {
2035
+ "epoch": 0.4083069341781063,
2036
+ "grad_norm": 18.30242156982422,
2037
+ "learning_rate": 6.489966051571492e-05,
2038
+ "loss": 1.4322,
2039
+ "step": 290
2040
+ },
2041
+ {
2042
+ "epoch": 0.40971488912354803,
2043
+ "grad_norm": 55.628604888916016,
2044
+ "learning_rate": 6.468712958105926e-05,
2045
+ "loss": 2.6193,
2046
+ "step": 291
2047
+ },
2048
+ {
2049
+ "epoch": 0.4111228440689898,
2050
+ "grad_norm": 150.41041564941406,
2051
+ "learning_rate": 6.447430782487473e-05,
2052
+ "loss": 2.5549,
2053
+ "step": 292
2054
+ },
2055
+ {
2056
+ "epoch": 0.4125307990144315,
2057
+ "grad_norm": 37.905914306640625,
2058
+ "learning_rate": 6.426119946126921e-05,
2059
+ "loss": 1.6198,
2060
+ "step": 293
2061
+ },
2062
+ {
2063
+ "epoch": 0.4139387539598733,
2064
+ "grad_norm": 28.293848037719727,
2065
+ "learning_rate": 6.404780871002575e-05,
2066
+ "loss": 0.9741,
2067
+ "step": 294
2068
+ },
2069
+ {
2070
+ "epoch": 0.41534670890531505,
2071
+ "grad_norm": 25.466543197631836,
2072
+ "learning_rate": 6.383413979651894e-05,
2073
+ "loss": 1.0264,
2074
+ "step": 295
2075
+ },
2076
+ {
2077
+ "epoch": 0.41675466385075677,
2078
+ "grad_norm": 16.458620071411133,
2079
+ "learning_rate": 6.362019695163132e-05,
2080
+ "loss": 0.9326,
2081
+ "step": 296
2082
+ },
2083
+ {
2084
+ "epoch": 0.41816261879619854,
2085
+ "grad_norm": 9.713531494140625,
2086
+ "learning_rate": 6.340598441166958e-05,
2087
+ "loss": 0.5798,
2088
+ "step": 297
2089
+ },
2090
+ {
2091
+ "epoch": 0.41957057374164025,
2092
+ "grad_norm": 50.1083869934082,
2093
+ "learning_rate": 6.31915064182807e-05,
2094
+ "loss": 1.43,
2095
+ "step": 298
2096
+ },
2097
+ {
2098
+ "epoch": 0.420978528687082,
2099
+ "grad_norm": 40.683753967285156,
2100
+ "learning_rate": 6.297676721836784e-05,
2101
+ "loss": 1.4706,
2102
+ "step": 299
2103
+ },
2104
+ {
2105
+ "epoch": 0.42238648363252373,
2106
+ "grad_norm": 43.749794006347656,
2107
+ "learning_rate": 6.276177106400647e-05,
2108
+ "loss": 3.0438,
2109
+ "step": 300
2110
+ },
2111
+ {
2112
+ "epoch": 0.4237944385779655,
2113
+ "grad_norm": 8.859760284423828,
2114
+ "learning_rate": 6.254652221235992e-05,
2115
+ "loss": 0.2372,
2116
+ "step": 301
2117
+ },
2118
+ {
2119
+ "epoch": 0.4252023935234073,
2120
+ "grad_norm": 6.741225719451904,
2121
+ "learning_rate": 6.23310249255953e-05,
2122
+ "loss": 0.5755,
2123
+ "step": 302
2124
+ },
2125
+ {
2126
+ "epoch": 0.426610348468849,
2127
+ "grad_norm": 21.322010040283203,
2128
+ "learning_rate": 6.211528347079896e-05,
2129
+ "loss": 1.0998,
2130
+ "step": 303
2131
+ },
2132
+ {
2133
+ "epoch": 0.42801830341429076,
2134
+ "grad_norm": 11.496692657470703,
2135
+ "learning_rate": 6.189930211989209e-05,
2136
+ "loss": 0.8235,
2137
+ "step": 304
2138
+ },
2139
+ {
2140
+ "epoch": 0.42942625835973247,
2141
+ "grad_norm": 7.478254318237305,
2142
+ "learning_rate": 6.168308514954602e-05,
2143
+ "loss": 0.6988,
2144
+ "step": 305
2145
+ },
2146
+ {
2147
+ "epoch": 0.43083421330517424,
2148
+ "grad_norm": 16.896244049072266,
2149
+ "learning_rate": 6.146663684109773e-05,
2150
+ "loss": 1.1964,
2151
+ "step": 306
2152
+ },
2153
+ {
2154
+ "epoch": 0.43224216825061595,
2155
+ "grad_norm": 14.001093864440918,
2156
+ "learning_rate": 6.124996148046478e-05,
2157
+ "loss": 1.1556,
2158
+ "step": 307
2159
+ },
2160
+ {
2161
+ "epoch": 0.4336501231960577,
2162
+ "grad_norm": 15.864665031433105,
2163
+ "learning_rate": 6.103306335806077e-05,
2164
+ "loss": 1.031,
2165
+ "step": 308
2166
+ },
2167
+ {
2168
+ "epoch": 0.4350580781414995,
2169
+ "grad_norm": 17.03790855407715,
2170
+ "learning_rate": 6.0815946768710176e-05,
2171
+ "loss": 0.676,
2172
+ "step": 309
2173
+ },
2174
+ {
2175
+ "epoch": 0.4364660330869412,
2176
+ "grad_norm": 12.771721839904785,
2177
+ "learning_rate": 6.0598616011563324e-05,
2178
+ "loss": 0.8617,
2179
+ "step": 310
2180
+ },
2181
+ {
2182
+ "epoch": 0.437873988032383,
2183
+ "grad_norm": 9.611161231994629,
2184
+ "learning_rate": 6.038107539001139e-05,
2185
+ "loss": 0.7115,
2186
+ "step": 311
2187
+ },
2188
+ {
2189
+ "epoch": 0.4392819429778247,
2190
+ "grad_norm": 23.515108108520508,
2191
+ "learning_rate": 6.016332921160099e-05,
2192
+ "loss": 1.2061,
2193
+ "step": 312
2194
+ },
2195
+ {
2196
+ "epoch": 0.44068989792326646,
2197
+ "grad_norm": 10.848517417907715,
2198
+ "learning_rate": 5.99453817879491e-05,
2199
+ "loss": 0.5806,
2200
+ "step": 313
2201
+ },
2202
+ {
2203
+ "epoch": 0.4420978528687082,
2204
+ "grad_norm": 9.37192153930664,
2205
+ "learning_rate": 5.972723743465749e-05,
2206
+ "loss": 0.4661,
2207
+ "step": 314
2208
+ },
2209
+ {
2210
+ "epoch": 0.44350580781414994,
2211
+ "grad_norm": 19.835905075073242,
2212
+ "learning_rate": 5.9508900471227416e-05,
2213
+ "loss": 0.7906,
2214
+ "step": 315
2215
+ },
2216
+ {
2217
+ "epoch": 0.4449137627595917,
2218
+ "grad_norm": 10.338900566101074,
2219
+ "learning_rate": 5.9290375220973985e-05,
2220
+ "loss": 0.5141,
2221
+ "step": 316
2222
+ },
2223
+ {
2224
+ "epoch": 0.4463217177050334,
2225
+ "grad_norm": 7.031118869781494,
2226
+ "learning_rate": 5.907166601094063e-05,
2227
+ "loss": 0.6734,
2228
+ "step": 317
2229
+ },
2230
+ {
2231
+ "epoch": 0.4477296726504752,
2232
+ "grad_norm": 12.128201484680176,
2233
+ "learning_rate": 5.885277717181338e-05,
2234
+ "loss": 0.808,
2235
+ "step": 318
2236
+ },
2237
+ {
2238
+ "epoch": 0.4491376275959169,
2239
+ "grad_norm": 18.01626968383789,
2240
+ "learning_rate": 5.86337130378351e-05,
2241
+ "loss": 0.9725,
2242
+ "step": 319
2243
+ },
2244
+ {
2245
+ "epoch": 0.4505455825413587,
2246
+ "grad_norm": 19.51142692565918,
2247
+ "learning_rate": 5.8414477946719725e-05,
2248
+ "loss": 1.0292,
2249
+ "step": 320
2250
+ },
2251
+ {
2252
+ "epoch": 0.45195353748680045,
2253
+ "grad_norm": 13.940886497497559,
2254
+ "learning_rate": 5.8195076239566304e-05,
2255
+ "loss": 0.7187,
2256
+ "step": 321
2257
+ },
2258
+ {
2259
+ "epoch": 0.45336149243224216,
2260
+ "grad_norm": 9.016057014465332,
2261
+ "learning_rate": 5.797551226077308e-05,
2262
+ "loss": 0.5587,
2263
+ "step": 322
2264
+ },
2265
+ {
2266
+ "epoch": 0.45476944737768393,
2267
+ "grad_norm": 7.983224391937256,
2268
+ "learning_rate": 5.775579035795145e-05,
2269
+ "loss": 0.469,
2270
+ "step": 323
2271
+ },
2272
+ {
2273
+ "epoch": 0.45617740232312565,
2274
+ "grad_norm": 11.286262512207031,
2275
+ "learning_rate": 5.753591488183988e-05,
2276
+ "loss": 0.7711,
2277
+ "step": 324
2278
+ },
2279
+ {
2280
+ "epoch": 0.4575853572685674,
2281
+ "grad_norm": 28.39682960510254,
2282
+ "learning_rate": 5.731589018621777e-05,
2283
+ "loss": 1.1854,
2284
+ "step": 325
2285
+ },
2286
+ {
2287
+ "epoch": 0.45899331221400913,
2288
+ "grad_norm": 11.899818420410156,
2289
+ "learning_rate": 5.709572062781924e-05,
2290
+ "loss": 0.7849,
2291
+ "step": 326
2292
+ },
2293
+ {
2294
+ "epoch": 0.4604012671594509,
2295
+ "grad_norm": 6.919939994812012,
2296
+ "learning_rate": 5.68754105662468e-05,
2297
+ "loss": 0.5996,
2298
+ "step": 327
2299
+ },
2300
+ {
2301
+ "epoch": 0.46180922210489267,
2302
+ "grad_norm": 7.084524154663086,
2303
+ "learning_rate": 5.665496436388515e-05,
2304
+ "loss": 0.3707,
2305
+ "step": 328
2306
+ },
2307
+ {
2308
+ "epoch": 0.4632171770503344,
2309
+ "grad_norm": 3.3268368244171143,
2310
+ "learning_rate": 5.643438638581472e-05,
2311
+ "loss": 0.184,
2312
+ "step": 329
2313
+ },
2314
+ {
2315
+ "epoch": 0.46462513199577615,
2316
+ "grad_norm": 9.73562240600586,
2317
+ "learning_rate": 5.621368099972519e-05,
2318
+ "loss": 0.6516,
2319
+ "step": 330
2320
+ },
2321
+ {
2322
+ "epoch": 0.46603308694121787,
2323
+ "grad_norm": 18.59488296508789,
2324
+ "learning_rate": 5.599285257582911e-05,
2325
+ "loss": 1.3035,
2326
+ "step": 331
2327
+ },
2328
+ {
2329
+ "epoch": 0.46744104188665964,
2330
+ "grad_norm": 6.1787543296813965,
2331
+ "learning_rate": 5.577190548677529e-05,
2332
+ "loss": 0.3772,
2333
+ "step": 332
2334
+ },
2335
+ {
2336
+ "epoch": 0.46884899683210135,
2337
+ "grad_norm": 13.894497871398926,
2338
+ "learning_rate": 5.555084410756226e-05,
2339
+ "loss": 0.6582,
2340
+ "step": 333
2341
+ },
2342
+ {
2343
+ "epoch": 0.4702569517775431,
2344
+ "grad_norm": 28.024072647094727,
2345
+ "learning_rate": 5.532967281545162e-05,
2346
+ "loss": 1.1448,
2347
+ "step": 334
2348
+ },
2349
+ {
2350
+ "epoch": 0.4716649067229849,
2351
+ "grad_norm": 13.692731857299805,
2352
+ "learning_rate": 5.510839598988137e-05,
2353
+ "loss": 1.0288,
2354
+ "step": 335
2355
+ },
2356
+ {
2357
+ "epoch": 0.4730728616684266,
2358
+ "grad_norm": 16.94679832458496,
2359
+ "learning_rate": 5.4887018012379164e-05,
2360
+ "loss": 0.8068,
2361
+ "step": 336
2362
+ },
2363
+ {
2364
+ "epoch": 0.4744808166138684,
2365
+ "grad_norm": 46.81898880004883,
2366
+ "learning_rate": 5.466554326647564e-05,
2367
+ "loss": 1.5328,
2368
+ "step": 337
2369
+ },
2370
+ {
2371
+ "epoch": 0.4758887715593101,
2372
+ "grad_norm": 23.71940803527832,
2373
+ "learning_rate": 5.4443976137617447e-05,
2374
+ "loss": 1.0762,
2375
+ "step": 338
2376
+ },
2377
+ {
2378
+ "epoch": 0.47729672650475186,
2379
+ "grad_norm": 16.341949462890625,
2380
+ "learning_rate": 5.422232101308064e-05,
2381
+ "loss": 0.7343,
2382
+ "step": 339
2383
+ },
2384
+ {
2385
+ "epoch": 0.47870468145019357,
2386
+ "grad_norm": 11.016429901123047,
2387
+ "learning_rate": 5.400058228188363e-05,
2388
+ "loss": 0.9309,
2389
+ "step": 340
2390
+ },
2391
+ {
2392
+ "epoch": 0.48011263639563534,
2393
+ "grad_norm": 17.73030662536621,
2394
+ "learning_rate": 5.377876433470031e-05,
2395
+ "loss": 0.9843,
2396
+ "step": 341
2397
+ },
2398
+ {
2399
+ "epoch": 0.4815205913410771,
2400
+ "grad_norm": 28.34784698486328,
2401
+ "learning_rate": 5.355687156377319e-05,
2402
+ "loss": 0.8469,
2403
+ "step": 342
2404
+ },
2405
+ {
2406
+ "epoch": 0.4829285462865188,
2407
+ "grad_norm": 15.134780883789062,
2408
+ "learning_rate": 5.333490836282635e-05,
2409
+ "loss": 1.1461,
2410
+ "step": 343
2411
+ },
2412
+ {
2413
+ "epoch": 0.4843365012319606,
2414
+ "grad_norm": 11.261271476745605,
2415
+ "learning_rate": 5.311287912697845e-05,
2416
+ "loss": 0.8169,
2417
+ "step": 344
2418
+ },
2419
+ {
2420
+ "epoch": 0.4857444561774023,
2421
+ "grad_norm": 13.379313468933105,
2422
+ "learning_rate": 5.289078825265573e-05,
2423
+ "loss": 0.6616,
2424
+ "step": 345
2425
+ },
2426
+ {
2427
+ "epoch": 0.4871524111228441,
2428
+ "grad_norm": 35.41781997680664,
2429
+ "learning_rate": 5.266864013750493e-05,
2430
+ "loss": 1.8836,
2431
+ "step": 346
2432
+ },
2433
+ {
2434
+ "epoch": 0.4885603660682858,
2435
+ "grad_norm": 12.9522705078125,
2436
+ "learning_rate": 5.244643918030623e-05,
2437
+ "loss": 0.4605,
2438
+ "step": 347
2439
+ },
2440
+ {
2441
+ "epoch": 0.48996832101372756,
2442
+ "grad_norm": 22.15134620666504,
2443
+ "learning_rate": 5.222418978088614e-05,
2444
+ "loss": 0.8405,
2445
+ "step": 348
2446
+ },
2447
+ {
2448
+ "epoch": 0.49137627595916933,
2449
+ "grad_norm": 26.150135040283203,
2450
+ "learning_rate": 5.200189634003041e-05,
2451
+ "loss": 1.0932,
2452
+ "step": 349
2453
+ },
2454
+ {
2455
+ "epoch": 0.49278423090461104,
2456
+ "grad_norm": 55.37350845336914,
2457
+ "learning_rate": 5.177956325939678e-05,
2458
+ "loss": 2.3808,
2459
+ "step": 350
2460
+ },
2461
+ {
2462
+ "epoch": 0.4941921858500528,
2463
+ "grad_norm": 20.655296325683594,
2464
+ "learning_rate": 5.155719494142799e-05,
2465
+ "loss": 0.4508,
2466
+ "step": 351
2467
+ },
2468
+ {
2469
+ "epoch": 0.4956001407954945,
2470
+ "grad_norm": 6.21237850189209,
2471
+ "learning_rate": 5.133479578926445e-05,
2472
+ "loss": 0.5059,
2473
+ "step": 352
2474
+ },
2475
+ {
2476
+ "epoch": 0.4970080957409363,
2477
+ "grad_norm": 42.47819519042969,
2478
+ "learning_rate": 5.111237020665718e-05,
2479
+ "loss": 1.501,
2480
+ "step": 353
2481
+ },
2482
+ {
2483
+ "epoch": 0.498416050686378,
2484
+ "grad_norm": 23.130695343017578,
2485
+ "learning_rate": 5.088992259788049e-05,
2486
+ "loss": 1.1172,
2487
+ "step": 354
2488
+ },
2489
+ {
2490
+ "epoch": 0.4998240056318198,
2491
+ "grad_norm": 18.151142120361328,
2492
+ "learning_rate": 5.06674573676449e-05,
2493
+ "loss": 0.8362,
2494
+ "step": 355
2495
+ },
2496
+ {
2497
+ "epoch": 0.5012319605772615,
2498
+ "grad_norm": 17.91101837158203,
2499
+ "learning_rate": 5.044497892100981e-05,
2500
+ "loss": 0.5968,
2501
+ "step": 356
2502
  }
2503
  ],
2504
  "logging_steps": 1,
 
2518
  "attributes": {}
2519
  }
2520
  },
2521
+ "total_flos": 3.720766820701962e+17,
2522
  "train_batch_size": 4,
2523
  "trial_name": null,
2524
  "trial_params": null