CocoRoF commited on
Commit
f2d52e6
·
verified ·
1 Parent(s): b5c5b7b

Training in progress, step 2000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d194ff2106a95e15e685172ae6a47d00ea660de0a4893795d6fd1b50dac4bdec
3
  size 791781368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eccb0e2bd66b47391dd89be4927a9aed1333b9e1e0b7d9f1951a4aaeb35401e8
3
  size 791781368
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0305990c18f63d6932eb2ee19ed615ed5c3dd288a5e26e6a3ac3db1ad85e2336
3
  size 2375487866
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12fb28e52c48bea00a85883fac40f131eedf9cbb7d2564050dd0b64aff066681
3
  size 2375487866
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d25a66bce66c09652383a0839fbbe1e8187db02fbdfb7a47d70ae88b467cd642
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c22704cb68ce2d335ed3aa00237997dfcf49180d12a11fa5fb434d67c714d2da
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c1a9c65c2869356282cad6b4a0f7dff7f4dd68ab3d9d216c72b7d6cb524f860
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:896febe768e17bae5022a95960c041f6425783774ec8859d99d3b149063b1bf9
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eac482d57e966585467c8ef44dae2869bf7e5d92886f69c11ed7bccc34c07efe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1f27d227a20dc320ac283e0938fb2f6e5b475829a583f8c44d1a16a8c828307
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d05a7106aaeaec4b81704e3f4a998b5123cf9342a6733bd9fd2d578e99108c3b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b94120d8d88502ec8d8b623ec7550315caca003b44fcffbb5767ab0de91baefe
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:332e4d901be380f740b5d8578f7b80ef1865c7fba83bc288c8a35852205cc668
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3447ca6cc16e14aea98b2269a3aa4e5446a87729acbfec0746d24b1cbb51286
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd44b9ad3ef5591f2a0671f1ec04c21ad479cbd1d478859e3ba017f1c74bf027
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4381101025451459,
5
  "eval_steps": 1000,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1415,6 +1415,1414 @@
1415
  "eval_samples_per_second": 998.04,
1416
  "eval_steps_per_second": 31.191,
1417
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1418
  }
1419
  ],
1420
  "logging_steps": 5,
@@ -1434,7 +2842,7 @@
1434
  "attributes": {}
1435
  }
1436
  },
1437
- "total_flos": 8.664715126584115e+18,
1438
  "train_batch_size": 8,
1439
  "trial_name": null,
1440
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8762202050902917,
5
  "eval_steps": 1000,
6
+ "global_step": 2000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1415
  "eval_samples_per_second": 998.04,
1416
  "eval_steps_per_second": 31.191,
1417
  "step": 1000
1418
+ },
1419
+ {
1420
+ "epoch": 0.4403006530578716,
1421
+ "grad_norm": 158.5,
1422
+ "learning_rate": 1.2440331222601072e-05,
1423
+ "loss": 25.968,
1424
+ "step": 1005
1425
+ },
1426
+ {
1427
+ "epoch": 0.44249120357059735,
1428
+ "grad_norm": 66.1875,
1429
+ "learning_rate": 1.2391622016561131e-05,
1430
+ "loss": 25.9174,
1431
+ "step": 1010
1432
+ },
1433
+ {
1434
+ "epoch": 0.44468175408332306,
1435
+ "grad_norm": 68.75,
1436
+ "learning_rate": 1.234291281052119e-05,
1437
+ "loss": 25.4954,
1438
+ "step": 1015
1439
+ },
1440
+ {
1441
+ "epoch": 0.4468723045960488,
1442
+ "grad_norm": 130.75,
1443
+ "learning_rate": 1.2294203604481248e-05,
1444
+ "loss": 25.5188,
1445
+ "step": 1020
1446
+ },
1447
+ {
1448
+ "epoch": 0.44906285510877453,
1449
+ "grad_norm": 112.3125,
1450
+ "learning_rate": 1.2245494398441307e-05,
1451
+ "loss": 25.8739,
1452
+ "step": 1025
1453
+ },
1454
+ {
1455
+ "epoch": 0.45125340562150024,
1456
+ "grad_norm": 117.0,
1457
+ "learning_rate": 1.2196785192401366e-05,
1458
+ "loss": 25.0384,
1459
+ "step": 1030
1460
+ },
1461
+ {
1462
+ "epoch": 0.453443956134226,
1463
+ "grad_norm": 77.375,
1464
+ "learning_rate": 1.2148075986361424e-05,
1465
+ "loss": 25.6608,
1466
+ "step": 1035
1467
+ },
1468
+ {
1469
+ "epoch": 0.4556345066469517,
1470
+ "grad_norm": 60.34375,
1471
+ "learning_rate": 1.2099366780321483e-05,
1472
+ "loss": 25.7742,
1473
+ "step": 1040
1474
+ },
1475
+ {
1476
+ "epoch": 0.4578250571596774,
1477
+ "grad_norm": 78.5625,
1478
+ "learning_rate": 1.2050657574281542e-05,
1479
+ "loss": 25.5131,
1480
+ "step": 1045
1481
+ },
1482
+ {
1483
+ "epoch": 0.4600156076724032,
1484
+ "grad_norm": 115.1875,
1485
+ "learning_rate": 1.2001948368241598e-05,
1486
+ "loss": 25.5009,
1487
+ "step": 1050
1488
+ },
1489
+ {
1490
+ "epoch": 0.4622061581851289,
1491
+ "grad_norm": 118.4375,
1492
+ "learning_rate": 1.1953239162201657e-05,
1493
+ "loss": 25.7924,
1494
+ "step": 1055
1495
+ },
1496
+ {
1497
+ "epoch": 0.46439670869785465,
1498
+ "grad_norm": 68.375,
1499
+ "learning_rate": 1.1904529956161715e-05,
1500
+ "loss": 25.9603,
1501
+ "step": 1060
1502
+ },
1503
+ {
1504
+ "epoch": 0.46658725921058036,
1505
+ "grad_norm": 121.375,
1506
+ "learning_rate": 1.1855820750121774e-05,
1507
+ "loss": 25.6717,
1508
+ "step": 1065
1509
+ },
1510
+ {
1511
+ "epoch": 0.46877780972330607,
1512
+ "grad_norm": 83.6875,
1513
+ "learning_rate": 1.1807111544081833e-05,
1514
+ "loss": 25.6563,
1515
+ "step": 1070
1516
+ },
1517
+ {
1518
+ "epoch": 0.47096836023603184,
1519
+ "grad_norm": 64.5625,
1520
+ "learning_rate": 1.175840233804189e-05,
1521
+ "loss": 26.0566,
1522
+ "step": 1075
1523
+ },
1524
+ {
1525
+ "epoch": 0.47315891074875754,
1526
+ "grad_norm": 81.6875,
1527
+ "learning_rate": 1.170969313200195e-05,
1528
+ "loss": 25.7885,
1529
+ "step": 1080
1530
+ },
1531
+ {
1532
+ "epoch": 0.47534946126148325,
1533
+ "grad_norm": 100.1875,
1534
+ "learning_rate": 1.1660983925962009e-05,
1535
+ "loss": 25.5735,
1536
+ "step": 1085
1537
+ },
1538
+ {
1539
+ "epoch": 0.477540011774209,
1540
+ "grad_norm": 116.0,
1541
+ "learning_rate": 1.1612274719922065e-05,
1542
+ "loss": 25.7891,
1543
+ "step": 1090
1544
+ },
1545
+ {
1546
+ "epoch": 0.4797305622869347,
1547
+ "grad_norm": 70.125,
1548
+ "learning_rate": 1.1563565513882124e-05,
1549
+ "loss": 25.6365,
1550
+ "step": 1095
1551
+ },
1552
+ {
1553
+ "epoch": 0.4819211127996605,
1554
+ "grad_norm": 52.96875,
1555
+ "learning_rate": 1.1514856307842183e-05,
1556
+ "loss": 25.5813,
1557
+ "step": 1100
1558
+ },
1559
+ {
1560
+ "epoch": 0.4841116633123862,
1561
+ "grad_norm": 150.125,
1562
+ "learning_rate": 1.1466147101802241e-05,
1563
+ "loss": 26.0147,
1564
+ "step": 1105
1565
+ },
1566
+ {
1567
+ "epoch": 0.4863022138251119,
1568
+ "grad_norm": 125.0625,
1569
+ "learning_rate": 1.14174378957623e-05,
1570
+ "loss": 26.1209,
1571
+ "step": 1110
1572
+ },
1573
+ {
1574
+ "epoch": 0.48849276433783767,
1575
+ "grad_norm": 89.875,
1576
+ "learning_rate": 1.136872868972236e-05,
1577
+ "loss": 25.7375,
1578
+ "step": 1115
1579
+ },
1580
+ {
1581
+ "epoch": 0.4906833148505634,
1582
+ "grad_norm": 125.5625,
1583
+ "learning_rate": 1.1320019483682417e-05,
1584
+ "loss": 25.246,
1585
+ "step": 1120
1586
+ },
1587
+ {
1588
+ "epoch": 0.4928738653632891,
1589
+ "grad_norm": 96.625,
1590
+ "learning_rate": 1.1271310277642476e-05,
1591
+ "loss": 25.7476,
1592
+ "step": 1125
1593
+ },
1594
+ {
1595
+ "epoch": 0.49506441587601485,
1596
+ "grad_norm": 148.375,
1597
+ "learning_rate": 1.1222601071602535e-05,
1598
+ "loss": 25.0299,
1599
+ "step": 1130
1600
+ },
1601
+ {
1602
+ "epoch": 0.49725496638874056,
1603
+ "grad_norm": 131.25,
1604
+ "learning_rate": 1.1173891865562591e-05,
1605
+ "loss": 25.7002,
1606
+ "step": 1135
1607
+ },
1608
+ {
1609
+ "epoch": 0.4994455169014663,
1610
+ "grad_norm": 99.0,
1611
+ "learning_rate": 1.112518265952265e-05,
1612
+ "loss": 25.3308,
1613
+ "step": 1140
1614
+ },
1615
+ {
1616
+ "epoch": 0.501636067414192,
1617
+ "grad_norm": 52.90625,
1618
+ "learning_rate": 1.107647345348271e-05,
1619
+ "loss": 25.7107,
1620
+ "step": 1145
1621
+ },
1622
+ {
1623
+ "epoch": 0.5038266179269177,
1624
+ "grad_norm": 92.375,
1625
+ "learning_rate": 1.1027764247442767e-05,
1626
+ "loss": 25.6609,
1627
+ "step": 1150
1628
+ },
1629
+ {
1630
+ "epoch": 0.5060171684396435,
1631
+ "grad_norm": 59.46875,
1632
+ "learning_rate": 1.0979055041402826e-05,
1633
+ "loss": 25.5677,
1634
+ "step": 1155
1635
+ },
1636
+ {
1637
+ "epoch": 0.5082077189523693,
1638
+ "grad_norm": 96.375,
1639
+ "learning_rate": 1.0930345835362886e-05,
1640
+ "loss": 25.3615,
1641
+ "step": 1160
1642
+ },
1643
+ {
1644
+ "epoch": 0.5103982694650949,
1645
+ "grad_norm": 75.0,
1646
+ "learning_rate": 1.0881636629322943e-05,
1647
+ "loss": 25.4644,
1648
+ "step": 1165
1649
+ },
1650
+ {
1651
+ "epoch": 0.5125888199778207,
1652
+ "grad_norm": 98.5,
1653
+ "learning_rate": 1.0832927423283002e-05,
1654
+ "loss": 25.6878,
1655
+ "step": 1170
1656
+ },
1657
+ {
1658
+ "epoch": 0.5147793704905465,
1659
+ "grad_norm": 76.1875,
1660
+ "learning_rate": 1.0784218217243058e-05,
1661
+ "loss": 25.8905,
1662
+ "step": 1175
1663
+ },
1664
+ {
1665
+ "epoch": 0.5169699210032721,
1666
+ "grad_norm": 52.4375,
1667
+ "learning_rate": 1.0735509011203117e-05,
1668
+ "loss": 25.7829,
1669
+ "step": 1180
1670
+ },
1671
+ {
1672
+ "epoch": 0.5191604715159979,
1673
+ "grad_norm": 105.625,
1674
+ "learning_rate": 1.0686799805163177e-05,
1675
+ "loss": 25.244,
1676
+ "step": 1185
1677
+ },
1678
+ {
1679
+ "epoch": 0.5213510220287236,
1680
+ "grad_norm": 59.84375,
1681
+ "learning_rate": 1.0638090599123234e-05,
1682
+ "loss": 25.1215,
1683
+ "step": 1190
1684
+ },
1685
+ {
1686
+ "epoch": 0.5235415725414493,
1687
+ "grad_norm": 54.03125,
1688
+ "learning_rate": 1.0589381393083293e-05,
1689
+ "loss": 25.3532,
1690
+ "step": 1195
1691
+ },
1692
+ {
1693
+ "epoch": 0.525732123054175,
1694
+ "grad_norm": 134.0,
1695
+ "learning_rate": 1.0540672187043353e-05,
1696
+ "loss": 24.9972,
1697
+ "step": 1200
1698
+ },
1699
+ {
1700
+ "epoch": 0.5279226735669008,
1701
+ "grad_norm": 77.5625,
1702
+ "learning_rate": 1.049196298100341e-05,
1703
+ "loss": 24.5776,
1704
+ "step": 1205
1705
+ },
1706
+ {
1707
+ "epoch": 0.5301132240796265,
1708
+ "grad_norm": 163.25,
1709
+ "learning_rate": 1.044325377496347e-05,
1710
+ "loss": 25.7603,
1711
+ "step": 1210
1712
+ },
1713
+ {
1714
+ "epoch": 0.5323037745923522,
1715
+ "grad_norm": 77.4375,
1716
+ "learning_rate": 1.0394544568923529e-05,
1717
+ "loss": 24.9012,
1718
+ "step": 1215
1719
+ },
1720
+ {
1721
+ "epoch": 0.534494325105078,
1722
+ "grad_norm": 101.25,
1723
+ "learning_rate": 1.0345835362883584e-05,
1724
+ "loss": 25.5309,
1725
+ "step": 1220
1726
+ },
1727
+ {
1728
+ "epoch": 0.5366848756178036,
1729
+ "grad_norm": 127.625,
1730
+ "learning_rate": 1.0297126156843644e-05,
1731
+ "loss": 25.7696,
1732
+ "step": 1225
1733
+ },
1734
+ {
1735
+ "epoch": 0.5388754261305294,
1736
+ "grad_norm": 79.75,
1737
+ "learning_rate": 1.0248416950803703e-05,
1738
+ "loss": 25.5109,
1739
+ "step": 1230
1740
+ },
1741
+ {
1742
+ "epoch": 0.5410659766432552,
1743
+ "grad_norm": 151.0,
1744
+ "learning_rate": 1.019970774476376e-05,
1745
+ "loss": 25.7191,
1746
+ "step": 1235
1747
+ },
1748
+ {
1749
+ "epoch": 0.5432565271559809,
1750
+ "grad_norm": 54.0,
1751
+ "learning_rate": 1.015099853872382e-05,
1752
+ "loss": 25.1186,
1753
+ "step": 1240
1754
+ },
1755
+ {
1756
+ "epoch": 0.5454470776687066,
1757
+ "grad_norm": 54.3125,
1758
+ "learning_rate": 1.0102289332683879e-05,
1759
+ "loss": 24.919,
1760
+ "step": 1245
1761
+ },
1762
+ {
1763
+ "epoch": 0.5476376281814324,
1764
+ "grad_norm": 106.8125,
1765
+ "learning_rate": 1.0053580126643936e-05,
1766
+ "loss": 25.1853,
1767
+ "step": 1250
1768
+ },
1769
+ {
1770
+ "epoch": 0.5498281786941581,
1771
+ "grad_norm": 97.75,
1772
+ "learning_rate": 1.0004870920603996e-05,
1773
+ "loss": 24.8172,
1774
+ "step": 1255
1775
+ },
1776
+ {
1777
+ "epoch": 0.5520187292068838,
1778
+ "grad_norm": 67.8125,
1779
+ "learning_rate": 9.956161714564053e-06,
1780
+ "loss": 24.8634,
1781
+ "step": 1260
1782
+ },
1783
+ {
1784
+ "epoch": 0.5542092797196095,
1785
+ "grad_norm": 169.125,
1786
+ "learning_rate": 9.90745250852411e-06,
1787
+ "loss": 25.2829,
1788
+ "step": 1265
1789
+ },
1790
+ {
1791
+ "epoch": 0.5563998302323353,
1792
+ "grad_norm": 116.375,
1793
+ "learning_rate": 9.85874330248417e-06,
1794
+ "loss": 25.702,
1795
+ "step": 1270
1796
+ },
1797
+ {
1798
+ "epoch": 0.558590380745061,
1799
+ "grad_norm": 229.125,
1800
+ "learning_rate": 9.810034096444229e-06,
1801
+ "loss": 24.9604,
1802
+ "step": 1275
1803
+ },
1804
+ {
1805
+ "epoch": 0.5607809312577867,
1806
+ "grad_norm": 51.5625,
1807
+ "learning_rate": 9.761324890404287e-06,
1808
+ "loss": 24.766,
1809
+ "step": 1280
1810
+ },
1811
+ {
1812
+ "epoch": 0.5629714817705125,
1813
+ "grad_norm": 88.5625,
1814
+ "learning_rate": 9.712615684364346e-06,
1815
+ "loss": 24.9662,
1816
+ "step": 1285
1817
+ },
1818
+ {
1819
+ "epoch": 0.5651620322832381,
1820
+ "grad_norm": 92.375,
1821
+ "learning_rate": 9.663906478324403e-06,
1822
+ "loss": 25.0194,
1823
+ "step": 1290
1824
+ },
1825
+ {
1826
+ "epoch": 0.5673525827959639,
1827
+ "grad_norm": 83.8125,
1828
+ "learning_rate": 9.615197272284463e-06,
1829
+ "loss": 25.4854,
1830
+ "step": 1295
1831
+ },
1832
+ {
1833
+ "epoch": 0.5695431333086897,
1834
+ "grad_norm": 76.75,
1835
+ "learning_rate": 9.566488066244522e-06,
1836
+ "loss": 24.8307,
1837
+ "step": 1300
1838
+ },
1839
+ {
1840
+ "epoch": 0.5717336838214153,
1841
+ "grad_norm": 92.75,
1842
+ "learning_rate": 9.51777886020458e-06,
1843
+ "loss": 24.5626,
1844
+ "step": 1305
1845
+ },
1846
+ {
1847
+ "epoch": 0.5739242343341411,
1848
+ "grad_norm": 98.6875,
1849
+ "learning_rate": 9.469069654164637e-06,
1850
+ "loss": 24.9844,
1851
+ "step": 1310
1852
+ },
1853
+ {
1854
+ "epoch": 0.5761147848468668,
1855
+ "grad_norm": 63.28125,
1856
+ "learning_rate": 9.420360448124696e-06,
1857
+ "loss": 25.0084,
1858
+ "step": 1315
1859
+ },
1860
+ {
1861
+ "epoch": 0.5783053353595926,
1862
+ "grad_norm": 272.75,
1863
+ "learning_rate": 9.371651242084755e-06,
1864
+ "loss": 25.2756,
1865
+ "step": 1320
1866
+ },
1867
+ {
1868
+ "epoch": 0.5804958858723183,
1869
+ "grad_norm": 214.125,
1870
+ "learning_rate": 9.322942036044813e-06,
1871
+ "loss": 25.1908,
1872
+ "step": 1325
1873
+ },
1874
+ {
1875
+ "epoch": 0.582686436385044,
1876
+ "grad_norm": 93.875,
1877
+ "learning_rate": 9.27423283000487e-06,
1878
+ "loss": 24.5392,
1879
+ "step": 1330
1880
+ },
1881
+ {
1882
+ "epoch": 0.5848769868977698,
1883
+ "grad_norm": 119.5625,
1884
+ "learning_rate": 9.22552362396493e-06,
1885
+ "loss": 25.1336,
1886
+ "step": 1335
1887
+ },
1888
+ {
1889
+ "epoch": 0.5870675374104954,
1890
+ "grad_norm": 104.4375,
1891
+ "learning_rate": 9.176814417924989e-06,
1892
+ "loss": 24.3678,
1893
+ "step": 1340
1894
+ },
1895
+ {
1896
+ "epoch": 0.5892580879232212,
1897
+ "grad_norm": 77.0625,
1898
+ "learning_rate": 9.128105211885046e-06,
1899
+ "loss": 25.1012,
1900
+ "step": 1345
1901
+ },
1902
+ {
1903
+ "epoch": 0.591448638435947,
1904
+ "grad_norm": 196.375,
1905
+ "learning_rate": 9.079396005845106e-06,
1906
+ "loss": 24.9593,
1907
+ "step": 1350
1908
+ },
1909
+ {
1910
+ "epoch": 0.5936391889486726,
1911
+ "grad_norm": 71.4375,
1912
+ "learning_rate": 9.030686799805163e-06,
1913
+ "loss": 24.8451,
1914
+ "step": 1355
1915
+ },
1916
+ {
1917
+ "epoch": 0.5958297394613984,
1918
+ "grad_norm": 389.5,
1919
+ "learning_rate": 8.981977593765222e-06,
1920
+ "loss": 25.182,
1921
+ "step": 1360
1922
+ },
1923
+ {
1924
+ "epoch": 0.5980202899741242,
1925
+ "grad_norm": 56.03125,
1926
+ "learning_rate": 8.933268387725282e-06,
1927
+ "loss": 24.3735,
1928
+ "step": 1365
1929
+ },
1930
+ {
1931
+ "epoch": 0.6002108404868498,
1932
+ "grad_norm": 55.53125,
1933
+ "learning_rate": 8.884559181685339e-06,
1934
+ "loss": 25.3762,
1935
+ "step": 1370
1936
+ },
1937
+ {
1938
+ "epoch": 0.6024013909995756,
1939
+ "grad_norm": 115.5,
1940
+ "learning_rate": 8.835849975645398e-06,
1941
+ "loss": 23.874,
1942
+ "step": 1375
1943
+ },
1944
+ {
1945
+ "epoch": 0.6045919415123013,
1946
+ "grad_norm": 76.0,
1947
+ "learning_rate": 8.787140769605456e-06,
1948
+ "loss": 25.0251,
1949
+ "step": 1380
1950
+ },
1951
+ {
1952
+ "epoch": 0.606782492025027,
1953
+ "grad_norm": 152.625,
1954
+ "learning_rate": 8.738431563565515e-06,
1955
+ "loss": 25.099,
1956
+ "step": 1385
1957
+ },
1958
+ {
1959
+ "epoch": 0.6089730425377528,
1960
+ "grad_norm": 81.625,
1961
+ "learning_rate": 8.689722357525573e-06,
1962
+ "loss": 24.6503,
1963
+ "step": 1390
1964
+ },
1965
+ {
1966
+ "epoch": 0.6111635930504785,
1967
+ "grad_norm": 99.9375,
1968
+ "learning_rate": 8.641013151485632e-06,
1969
+ "loss": 25.0892,
1970
+ "step": 1395
1971
+ },
1972
+ {
1973
+ "epoch": 0.6133541435632043,
1974
+ "grad_norm": 67.5625,
1975
+ "learning_rate": 8.592303945445691e-06,
1976
+ "loss": 25.1785,
1977
+ "step": 1400
1978
+ },
1979
+ {
1980
+ "epoch": 0.6155446940759299,
1981
+ "grad_norm": 133.125,
1982
+ "learning_rate": 8.543594739405749e-06,
1983
+ "loss": 25.2308,
1984
+ "step": 1405
1985
+ },
1986
+ {
1987
+ "epoch": 0.6177352445886557,
1988
+ "grad_norm": 127.1875,
1989
+ "learning_rate": 8.494885533365806e-06,
1990
+ "loss": 24.7525,
1991
+ "step": 1410
1992
+ },
1993
+ {
1994
+ "epoch": 0.6199257951013815,
1995
+ "grad_norm": 88.4375,
1996
+ "learning_rate": 8.446176327325865e-06,
1997
+ "loss": 25.3764,
1998
+ "step": 1415
1999
+ },
2000
+ {
2001
+ "epoch": 0.6221163456141071,
2002
+ "grad_norm": 68.1875,
2003
+ "learning_rate": 8.397467121285925e-06,
2004
+ "loss": 25.6512,
2005
+ "step": 1420
2006
+ },
2007
+ {
2008
+ "epoch": 0.6243068961268329,
2009
+ "grad_norm": 105.25,
2010
+ "learning_rate": 8.348757915245982e-06,
2011
+ "loss": 24.5618,
2012
+ "step": 1425
2013
+ },
2014
+ {
2015
+ "epoch": 0.6264974466395586,
2016
+ "grad_norm": 62.96875,
2017
+ "learning_rate": 8.300048709206041e-06,
2018
+ "loss": 25.5549,
2019
+ "step": 1430
2020
+ },
2021
+ {
2022
+ "epoch": 0.6286879971522843,
2023
+ "grad_norm": 112.75,
2024
+ "learning_rate": 8.251339503166099e-06,
2025
+ "loss": 25.7603,
2026
+ "step": 1435
2027
+ },
2028
+ {
2029
+ "epoch": 0.6308785476650101,
2030
+ "grad_norm": 146.75,
2031
+ "learning_rate": 8.202630297126158e-06,
2032
+ "loss": 24.9323,
2033
+ "step": 1440
2034
+ },
2035
+ {
2036
+ "epoch": 0.6330690981777358,
2037
+ "grad_norm": 72.8125,
2038
+ "learning_rate": 8.153921091086216e-06,
2039
+ "loss": 25.1751,
2040
+ "step": 1445
2041
+ },
2042
+ {
2043
+ "epoch": 0.6352596486904615,
2044
+ "grad_norm": 131.75,
2045
+ "learning_rate": 8.105211885046275e-06,
2046
+ "loss": 24.7063,
2047
+ "step": 1450
2048
+ },
2049
+ {
2050
+ "epoch": 0.6374501992031872,
2051
+ "grad_norm": 81.5,
2052
+ "learning_rate": 8.056502679006332e-06,
2053
+ "loss": 25.0378,
2054
+ "step": 1455
2055
+ },
2056
+ {
2057
+ "epoch": 0.639640749715913,
2058
+ "grad_norm": 74.0625,
2059
+ "learning_rate": 8.007793472966392e-06,
2060
+ "loss": 24.7719,
2061
+ "step": 1460
2062
+ },
2063
+ {
2064
+ "epoch": 0.6418313002286387,
2065
+ "grad_norm": 65.0,
2066
+ "learning_rate": 7.95908426692645e-06,
2067
+ "loss": 24.439,
2068
+ "step": 1465
2069
+ },
2070
+ {
2071
+ "epoch": 0.6440218507413644,
2072
+ "grad_norm": 66.8125,
2073
+ "learning_rate": 7.910375060886508e-06,
2074
+ "loss": 24.7476,
2075
+ "step": 1470
2076
+ },
2077
+ {
2078
+ "epoch": 0.6462124012540902,
2079
+ "grad_norm": 38.3125,
2080
+ "learning_rate": 7.861665854846566e-06,
2081
+ "loss": 25.3374,
2082
+ "step": 1475
2083
+ },
2084
+ {
2085
+ "epoch": 0.648402951766816,
2086
+ "grad_norm": 55.5625,
2087
+ "learning_rate": 7.812956648806625e-06,
2088
+ "loss": 24.3505,
2089
+ "step": 1480
2090
+ },
2091
+ {
2092
+ "epoch": 0.6505935022795416,
2093
+ "grad_norm": 86.25,
2094
+ "learning_rate": 7.764247442766684e-06,
2095
+ "loss": 24.8686,
2096
+ "step": 1485
2097
+ },
2098
+ {
2099
+ "epoch": 0.6527840527922674,
2100
+ "grad_norm": 78.4375,
2101
+ "learning_rate": 7.715538236726742e-06,
2102
+ "loss": 24.9231,
2103
+ "step": 1490
2104
+ },
2105
+ {
2106
+ "epoch": 0.6549746033049931,
2107
+ "grad_norm": 110.9375,
2108
+ "learning_rate": 7.666829030686801e-06,
2109
+ "loss": 25.0029,
2110
+ "step": 1495
2111
+ },
2112
+ {
2113
+ "epoch": 0.6571651538177188,
2114
+ "grad_norm": 43.84375,
2115
+ "learning_rate": 7.6181198246468595e-06,
2116
+ "loss": 24.4183,
2117
+ "step": 1500
2118
+ },
2119
+ {
2120
+ "epoch": 0.6593557043304445,
2121
+ "grad_norm": 50.46875,
2122
+ "learning_rate": 7.569410618606917e-06,
2123
+ "loss": 24.3607,
2124
+ "step": 1505
2125
+ },
2126
+ {
2127
+ "epoch": 0.6615462548431703,
2128
+ "grad_norm": 220.125,
2129
+ "learning_rate": 7.520701412566975e-06,
2130
+ "loss": 24.7162,
2131
+ "step": 1510
2132
+ },
2133
+ {
2134
+ "epoch": 0.663736805355896,
2135
+ "grad_norm": 89.625,
2136
+ "learning_rate": 7.471992206527035e-06,
2137
+ "loss": 24.8443,
2138
+ "step": 1515
2139
+ },
2140
+ {
2141
+ "epoch": 0.6659273558686217,
2142
+ "grad_norm": 45.96875,
2143
+ "learning_rate": 7.423283000487093e-06,
2144
+ "loss": 24.0884,
2145
+ "step": 1520
2146
+ },
2147
+ {
2148
+ "epoch": 0.6681179063813475,
2149
+ "grad_norm": 41.125,
2150
+ "learning_rate": 7.3745737944471505e-06,
2151
+ "loss": 24.1827,
2152
+ "step": 1525
2153
+ },
2154
+ {
2155
+ "epoch": 0.6703084568940731,
2156
+ "grad_norm": 46.0625,
2157
+ "learning_rate": 7.32586458840721e-06,
2158
+ "loss": 24.6457,
2159
+ "step": 1530
2160
+ },
2161
+ {
2162
+ "epoch": 0.6724990074067989,
2163
+ "grad_norm": 49.96875,
2164
+ "learning_rate": 7.277155382367268e-06,
2165
+ "loss": 24.2197,
2166
+ "step": 1535
2167
+ },
2168
+ {
2169
+ "epoch": 0.6746895579195247,
2170
+ "grad_norm": 55.4375,
2171
+ "learning_rate": 7.2284461763273265e-06,
2172
+ "loss": 24.5083,
2173
+ "step": 1540
2174
+ },
2175
+ {
2176
+ "epoch": 0.6768801084322503,
2177
+ "grad_norm": 147.375,
2178
+ "learning_rate": 7.179736970287386e-06,
2179
+ "loss": 24.5295,
2180
+ "step": 1545
2181
+ },
2182
+ {
2183
+ "epoch": 0.6790706589449761,
2184
+ "grad_norm": 66.8125,
2185
+ "learning_rate": 7.131027764247443e-06,
2186
+ "loss": 24.3484,
2187
+ "step": 1550
2188
+ },
2189
+ {
2190
+ "epoch": 0.6812612094577019,
2191
+ "grad_norm": 99.125,
2192
+ "learning_rate": 7.082318558207502e-06,
2193
+ "loss": 24.5569,
2194
+ "step": 1555
2195
+ },
2196
+ {
2197
+ "epoch": 0.6834517599704276,
2198
+ "grad_norm": 55.34375,
2199
+ "learning_rate": 7.03360935216756e-06,
2200
+ "loss": 24.0178,
2201
+ "step": 1560
2202
+ },
2203
+ {
2204
+ "epoch": 0.6856423104831533,
2205
+ "grad_norm": 89.8125,
2206
+ "learning_rate": 6.984900146127619e-06,
2207
+ "loss": 23.5264,
2208
+ "step": 1565
2209
+ },
2210
+ {
2211
+ "epoch": 0.687832860995879,
2212
+ "grad_norm": 248.5,
2213
+ "learning_rate": 6.936190940087677e-06,
2214
+ "loss": 25.177,
2215
+ "step": 1570
2216
+ },
2217
+ {
2218
+ "epoch": 0.6900234115086048,
2219
+ "grad_norm": 97.75,
2220
+ "learning_rate": 6.887481734047735e-06,
2221
+ "loss": 24.3799,
2222
+ "step": 1575
2223
+ },
2224
+ {
2225
+ "epoch": 0.6922139620213305,
2226
+ "grad_norm": 101.0,
2227
+ "learning_rate": 6.838772528007794e-06,
2228
+ "loss": 24.7024,
2229
+ "step": 1580
2230
+ },
2231
+ {
2232
+ "epoch": 0.6944045125340562,
2233
+ "grad_norm": 94.1875,
2234
+ "learning_rate": 6.790063321967853e-06,
2235
+ "loss": 24.3573,
2236
+ "step": 1585
2237
+ },
2238
+ {
2239
+ "epoch": 0.696595063046782,
2240
+ "grad_norm": 71.0625,
2241
+ "learning_rate": 6.74135411592791e-06,
2242
+ "loss": 24.3899,
2243
+ "step": 1590
2244
+ },
2245
+ {
2246
+ "epoch": 0.6987856135595076,
2247
+ "grad_norm": 62.90625,
2248
+ "learning_rate": 6.6926449098879695e-06,
2249
+ "loss": 24.4977,
2250
+ "step": 1595
2251
+ },
2252
+ {
2253
+ "epoch": 0.7009761640722334,
2254
+ "grad_norm": 42.25,
2255
+ "learning_rate": 6.643935703848028e-06,
2256
+ "loss": 24.0119,
2257
+ "step": 1600
2258
+ },
2259
+ {
2260
+ "epoch": 0.7031667145849592,
2261
+ "grad_norm": 72.75,
2262
+ "learning_rate": 6.595226497808086e-06,
2263
+ "loss": 24.1055,
2264
+ "step": 1605
2265
+ },
2266
+ {
2267
+ "epoch": 0.7053572650976848,
2268
+ "grad_norm": 50.1875,
2269
+ "learning_rate": 6.5465172917681454e-06,
2270
+ "loss": 24.6823,
2271
+ "step": 1610
2272
+ },
2273
+ {
2274
+ "epoch": 0.7075478156104106,
2275
+ "grad_norm": 76.75,
2276
+ "learning_rate": 6.497808085728203e-06,
2277
+ "loss": 24.3863,
2278
+ "step": 1615
2279
+ },
2280
+ {
2281
+ "epoch": 0.7097383661231363,
2282
+ "grad_norm": 41.8125,
2283
+ "learning_rate": 6.449098879688261e-06,
2284
+ "loss": 24.5434,
2285
+ "step": 1620
2286
+ },
2287
+ {
2288
+ "epoch": 0.711928916635862,
2289
+ "grad_norm": 49.71875,
2290
+ "learning_rate": 6.40038967364832e-06,
2291
+ "loss": 23.9173,
2292
+ "step": 1625
2293
+ },
2294
+ {
2295
+ "epoch": 0.7141194671485878,
2296
+ "grad_norm": 303.75,
2297
+ "learning_rate": 6.351680467608379e-06,
2298
+ "loss": 23.8009,
2299
+ "step": 1630
2300
+ },
2301
+ {
2302
+ "epoch": 0.7163100176613135,
2303
+ "grad_norm": 107.0,
2304
+ "learning_rate": 6.3029712615684365e-06,
2305
+ "loss": 24.348,
2306
+ "step": 1635
2307
+ },
2308
+ {
2309
+ "epoch": 0.7185005681740393,
2310
+ "grad_norm": 57.5625,
2311
+ "learning_rate": 6.254262055528495e-06,
2312
+ "loss": 24.7125,
2313
+ "step": 1640
2314
+ },
2315
+ {
2316
+ "epoch": 0.7206911186867649,
2317
+ "grad_norm": 83.125,
2318
+ "learning_rate": 6.205552849488554e-06,
2319
+ "loss": 24.2265,
2320
+ "step": 1645
2321
+ },
2322
+ {
2323
+ "epoch": 0.7228816691994907,
2324
+ "grad_norm": 42.28125,
2325
+ "learning_rate": 6.1568436434486125e-06,
2326
+ "loss": 24.2224,
2327
+ "step": 1650
2328
+ },
2329
+ {
2330
+ "epoch": 0.7250722197122165,
2331
+ "grad_norm": 64.125,
2332
+ "learning_rate": 6.10813443740867e-06,
2333
+ "loss": 24.8299,
2334
+ "step": 1655
2335
+ },
2336
+ {
2337
+ "epoch": 0.7272627702249421,
2338
+ "grad_norm": 105.125,
2339
+ "learning_rate": 6.059425231368729e-06,
2340
+ "loss": 23.6778,
2341
+ "step": 1660
2342
+ },
2343
+ {
2344
+ "epoch": 0.7294533207376679,
2345
+ "grad_norm": 113.6875,
2346
+ "learning_rate": 6.010716025328788e-06,
2347
+ "loss": 24.2767,
2348
+ "step": 1665
2349
+ },
2350
+ {
2351
+ "epoch": 0.7316438712503937,
2352
+ "grad_norm": 67.9375,
2353
+ "learning_rate": 5.962006819288846e-06,
2354
+ "loss": 23.9942,
2355
+ "step": 1670
2356
+ },
2357
+ {
2358
+ "epoch": 0.7338344217631193,
2359
+ "grad_norm": 54.03125,
2360
+ "learning_rate": 5.913297613248905e-06,
2361
+ "loss": 23.7793,
2362
+ "step": 1675
2363
+ },
2364
+ {
2365
+ "epoch": 0.7360249722758451,
2366
+ "grad_norm": 63.625,
2367
+ "learning_rate": 5.8645884072089636e-06,
2368
+ "loss": 23.582,
2369
+ "step": 1680
2370
+ },
2371
+ {
2372
+ "epoch": 0.7382155227885708,
2373
+ "grad_norm": 58.375,
2374
+ "learning_rate": 5.815879201169021e-06,
2375
+ "loss": 24.222,
2376
+ "step": 1685
2377
+ },
2378
+ {
2379
+ "epoch": 0.7404060733012965,
2380
+ "grad_norm": 68.8125,
2381
+ "learning_rate": 5.7671699951290795e-06,
2382
+ "loss": 23.639,
2383
+ "step": 1690
2384
+ },
2385
+ {
2386
+ "epoch": 0.7425966238140222,
2387
+ "grad_norm": 32.53125,
2388
+ "learning_rate": 5.718460789089139e-06,
2389
+ "loss": 23.6367,
2390
+ "step": 1695
2391
+ },
2392
+ {
2393
+ "epoch": 0.744787174326748,
2394
+ "grad_norm": 62.53125,
2395
+ "learning_rate": 5.669751583049197e-06,
2396
+ "loss": 24.6188,
2397
+ "step": 1700
2398
+ },
2399
+ {
2400
+ "epoch": 0.7469777248394737,
2401
+ "grad_norm": 108.75,
2402
+ "learning_rate": 5.621042377009255e-06,
2403
+ "loss": 24.1849,
2404
+ "step": 1705
2405
+ },
2406
+ {
2407
+ "epoch": 0.7491682753521994,
2408
+ "grad_norm": 126.0,
2409
+ "learning_rate": 5.572333170969314e-06,
2410
+ "loss": 24.3651,
2411
+ "step": 1710
2412
+ },
2413
+ {
2414
+ "epoch": 0.7513588258649252,
2415
+ "grad_norm": 50.84375,
2416
+ "learning_rate": 5.523623964929372e-06,
2417
+ "loss": 24.2613,
2418
+ "step": 1715
2419
+ },
2420
+ {
2421
+ "epoch": 0.753549376377651,
2422
+ "grad_norm": 74.5625,
2423
+ "learning_rate": 5.474914758889431e-06,
2424
+ "loss": 23.998,
2425
+ "step": 1720
2426
+ },
2427
+ {
2428
+ "epoch": 0.7557399268903766,
2429
+ "grad_norm": 86.125,
2430
+ "learning_rate": 5.42620555284949e-06,
2431
+ "loss": 23.9456,
2432
+ "step": 1725
2433
+ },
2434
+ {
2435
+ "epoch": 0.7579304774031024,
2436
+ "grad_norm": 61.09375,
2437
+ "learning_rate": 5.377496346809547e-06,
2438
+ "loss": 23.3448,
2439
+ "step": 1730
2440
+ },
2441
+ {
2442
+ "epoch": 0.7601210279158281,
2443
+ "grad_norm": 65.125,
2444
+ "learning_rate": 5.328787140769606e-06,
2445
+ "loss": 24.1856,
2446
+ "step": 1735
2447
+ },
2448
+ {
2449
+ "epoch": 0.7623115784285538,
2450
+ "grad_norm": 54.375,
2451
+ "learning_rate": 5.280077934729664e-06,
2452
+ "loss": 23.7022,
2453
+ "step": 1740
2454
+ },
2455
+ {
2456
+ "epoch": 0.7645021289412796,
2457
+ "grad_norm": 73.8125,
2458
+ "learning_rate": 5.231368728689723e-06,
2459
+ "loss": 24.0991,
2460
+ "step": 1745
2461
+ },
2462
+ {
2463
+ "epoch": 0.7666926794540053,
2464
+ "grad_norm": 41.6875,
2465
+ "learning_rate": 5.182659522649781e-06,
2466
+ "loss": 23.627,
2467
+ "step": 1750
2468
+ },
2469
+ {
2470
+ "epoch": 0.768883229966731,
2471
+ "grad_norm": 95.5625,
2472
+ "learning_rate": 5.133950316609839e-06,
2473
+ "loss": 24.0103,
2474
+ "step": 1755
2475
+ },
2476
+ {
2477
+ "epoch": 0.7710737804794567,
2478
+ "grad_norm": 58.09375,
2479
+ "learning_rate": 5.0852411105698985e-06,
2480
+ "loss": 24.4259,
2481
+ "step": 1760
2482
+ },
2483
+ {
2484
+ "epoch": 0.7732643309921825,
2485
+ "grad_norm": 51.1875,
2486
+ "learning_rate": 5.036531904529957e-06,
2487
+ "loss": 24.3509,
2488
+ "step": 1765
2489
+ },
2490
+ {
2491
+ "epoch": 0.7754548815049082,
2492
+ "grad_norm": 26.265625,
2493
+ "learning_rate": 4.987822698490015e-06,
2494
+ "loss": 24.2639,
2495
+ "step": 1770
2496
+ },
2497
+ {
2498
+ "epoch": 0.7776454320176339,
2499
+ "grad_norm": 37.03125,
2500
+ "learning_rate": 4.939113492450074e-06,
2501
+ "loss": 23.9353,
2502
+ "step": 1775
2503
+ },
2504
+ {
2505
+ "epoch": 0.7798359825303597,
2506
+ "grad_norm": 71.6875,
2507
+ "learning_rate": 4.890404286410132e-06,
2508
+ "loss": 23.7745,
2509
+ "step": 1780
2510
+ },
2511
+ {
2512
+ "epoch": 0.7820265330430853,
2513
+ "grad_norm": 53.875,
2514
+ "learning_rate": 4.84169508037019e-06,
2515
+ "loss": 24.0737,
2516
+ "step": 1785
2517
+ },
2518
+ {
2519
+ "epoch": 0.7842170835558111,
2520
+ "grad_norm": 62.15625,
2521
+ "learning_rate": 4.792985874330249e-06,
2522
+ "loss": 24.3623,
2523
+ "step": 1790
2524
+ },
2525
+ {
2526
+ "epoch": 0.7864076340685369,
2527
+ "grad_norm": 46.3125,
2528
+ "learning_rate": 4.744276668290307e-06,
2529
+ "loss": 24.1168,
2530
+ "step": 1795
2531
+ },
2532
+ {
2533
+ "epoch": 0.7885981845812626,
2534
+ "grad_norm": 49.90625,
2535
+ "learning_rate": 4.6955674622503655e-06,
2536
+ "loss": 23.9208,
2537
+ "step": 1800
2538
+ },
2539
+ {
2540
+ "epoch": 0.7907887350939883,
2541
+ "grad_norm": 70.0625,
2542
+ "learning_rate": 4.646858256210424e-06,
2543
+ "loss": 23.9657,
2544
+ "step": 1805
2545
+ },
2546
+ {
2547
+ "epoch": 0.792979285606714,
2548
+ "grad_norm": 273.5,
2549
+ "learning_rate": 4.598149050170483e-06,
2550
+ "loss": 23.4883,
2551
+ "step": 1810
2552
+ },
2553
+ {
2554
+ "epoch": 0.7951698361194398,
2555
+ "grad_norm": 36.46875,
2556
+ "learning_rate": 4.549439844130541e-06,
2557
+ "loss": 24.1732,
2558
+ "step": 1815
2559
+ },
2560
+ {
2561
+ "epoch": 0.7973603866321655,
2562
+ "grad_norm": 98.1875,
2563
+ "learning_rate": 4.5007306380906e-06,
2564
+ "loss": 24.5668,
2565
+ "step": 1820
2566
+ },
2567
+ {
2568
+ "epoch": 0.7995509371448912,
2569
+ "grad_norm": 79.9375,
2570
+ "learning_rate": 4.452021432050657e-06,
2571
+ "loss": 23.3709,
2572
+ "step": 1825
2573
+ },
2574
+ {
2575
+ "epoch": 0.801741487657617,
2576
+ "grad_norm": 51.5,
2577
+ "learning_rate": 4.403312226010717e-06,
2578
+ "loss": 23.4818,
2579
+ "step": 1830
2580
+ },
2581
+ {
2582
+ "epoch": 0.8039320381703426,
2583
+ "grad_norm": 72.25,
2584
+ "learning_rate": 4.354603019970775e-06,
2585
+ "loss": 23.678,
2586
+ "step": 1835
2587
+ },
2588
+ {
2589
+ "epoch": 0.8061225886830684,
2590
+ "grad_norm": 56.21875,
2591
+ "learning_rate": 4.305893813930833e-06,
2592
+ "loss": 23.3332,
2593
+ "step": 1840
2594
+ },
2595
+ {
2596
+ "epoch": 0.8083131391957942,
2597
+ "grad_norm": 179.25,
2598
+ "learning_rate": 4.257184607890892e-06,
2599
+ "loss": 23.4972,
2600
+ "step": 1845
2601
+ },
2602
+ {
2603
+ "epoch": 0.8105036897085198,
2604
+ "grad_norm": 105.875,
2605
+ "learning_rate": 4.20847540185095e-06,
2606
+ "loss": 23.0243,
2607
+ "step": 1850
2608
+ },
2609
+ {
2610
+ "epoch": 0.8126942402212456,
2611
+ "grad_norm": 49.53125,
2612
+ "learning_rate": 4.1597661958110085e-06,
2613
+ "loss": 23.926,
2614
+ "step": 1855
2615
+ },
2616
+ {
2617
+ "epoch": 0.8148847907339714,
2618
+ "grad_norm": 42.1875,
2619
+ "learning_rate": 4.111056989771067e-06,
2620
+ "loss": 24.0595,
2621
+ "step": 1860
2622
+ },
2623
+ {
2624
+ "epoch": 0.817075341246697,
2625
+ "grad_norm": 38.0625,
2626
+ "learning_rate": 4.062347783731125e-06,
2627
+ "loss": 23.3749,
2628
+ "step": 1865
2629
+ },
2630
+ {
2631
+ "epoch": 0.8192658917594228,
2632
+ "grad_norm": 46.21875,
2633
+ "learning_rate": 4.0136385776911845e-06,
2634
+ "loss": 23.5141,
2635
+ "step": 1870
2636
+ },
2637
+ {
2638
+ "epoch": 0.8214564422721485,
2639
+ "grad_norm": 64.6875,
2640
+ "learning_rate": 3.964929371651242e-06,
2641
+ "loss": 22.9993,
2642
+ "step": 1875
2643
+ },
2644
+ {
2645
+ "epoch": 0.8236469927848743,
2646
+ "grad_norm": 104.1875,
2647
+ "learning_rate": 3.916220165611301e-06,
2648
+ "loss": 23.1683,
2649
+ "step": 1880
2650
+ },
2651
+ {
2652
+ "epoch": 0.8258375432976,
2653
+ "grad_norm": 58.84375,
2654
+ "learning_rate": 3.86751095957136e-06,
2655
+ "loss": 23.006,
2656
+ "step": 1885
2657
+ },
2658
+ {
2659
+ "epoch": 0.8280280938103257,
2660
+ "grad_norm": 47.625,
2661
+ "learning_rate": 3.818801753531418e-06,
2662
+ "loss": 23.3635,
2663
+ "step": 1890
2664
+ },
2665
+ {
2666
+ "epoch": 0.8302186443230515,
2667
+ "grad_norm": 37.03125,
2668
+ "learning_rate": 3.7700925474914763e-06,
2669
+ "loss": 23.9395,
2670
+ "step": 1895
2671
+ },
2672
+ {
2673
+ "epoch": 0.8324091948357771,
2674
+ "grad_norm": 40.0625,
2675
+ "learning_rate": 3.7213833414515347e-06,
2676
+ "loss": 23.6212,
2677
+ "step": 1900
2678
+ },
2679
+ {
2680
+ "epoch": 0.8345997453485029,
2681
+ "grad_norm": 144.875,
2682
+ "learning_rate": 3.672674135411593e-06,
2683
+ "loss": 23.753,
2684
+ "step": 1905
2685
+ },
2686
+ {
2687
+ "epoch": 0.8367902958612287,
2688
+ "grad_norm": 34.53125,
2689
+ "learning_rate": 3.6239649293716515e-06,
2690
+ "loss": 22.3497,
2691
+ "step": 1910
2692
+ },
2693
+ {
2694
+ "epoch": 0.8389808463739543,
2695
+ "grad_norm": 44.71875,
2696
+ "learning_rate": 3.57525572333171e-06,
2697
+ "loss": 22.9936,
2698
+ "step": 1915
2699
+ },
2700
+ {
2701
+ "epoch": 0.8411713968866801,
2702
+ "grad_norm": 42.5625,
2703
+ "learning_rate": 3.5265465172917687e-06,
2704
+ "loss": 23.6379,
2705
+ "step": 1920
2706
+ },
2707
+ {
2708
+ "epoch": 0.8433619473994058,
2709
+ "grad_norm": 77.8125,
2710
+ "learning_rate": 3.477837311251827e-06,
2711
+ "loss": 23.9546,
2712
+ "step": 1925
2713
+ },
2714
+ {
2715
+ "epoch": 0.8455524979121315,
2716
+ "grad_norm": 149.375,
2717
+ "learning_rate": 3.4291281052118854e-06,
2718
+ "loss": 23.1372,
2719
+ "step": 1930
2720
+ },
2721
+ {
2722
+ "epoch": 0.8477430484248573,
2723
+ "grad_norm": 105.0625,
2724
+ "learning_rate": 3.3804188991719438e-06,
2725
+ "loss": 23.1796,
2726
+ "step": 1935
2727
+ },
2728
+ {
2729
+ "epoch": 0.849933598937583,
2730
+ "grad_norm": 27.484375,
2731
+ "learning_rate": 3.331709693132002e-06,
2732
+ "loss": 23.3445,
2733
+ "step": 1940
2734
+ },
2735
+ {
2736
+ "epoch": 0.8521241494503087,
2737
+ "grad_norm": 50.78125,
2738
+ "learning_rate": 3.2830004870920605e-06,
2739
+ "loss": 23.4366,
2740
+ "step": 1945
2741
+ },
2742
+ {
2743
+ "epoch": 0.8543146999630344,
2744
+ "grad_norm": 33.125,
2745
+ "learning_rate": 3.2342912810521193e-06,
2746
+ "loss": 23.522,
2747
+ "step": 1950
2748
+ },
2749
+ {
2750
+ "epoch": 0.8565052504757602,
2751
+ "grad_norm": 36.71875,
2752
+ "learning_rate": 3.1855820750121773e-06,
2753
+ "loss": 23.6913,
2754
+ "step": 1955
2755
+ },
2756
+ {
2757
+ "epoch": 0.858695800988486,
2758
+ "grad_norm": 48.59375,
2759
+ "learning_rate": 3.136872868972236e-06,
2760
+ "loss": 23.2535,
2761
+ "step": 1960
2762
+ },
2763
+ {
2764
+ "epoch": 0.8608863515012116,
2765
+ "grad_norm": 90.1875,
2766
+ "learning_rate": 3.088163662932294e-06,
2767
+ "loss": 23.2659,
2768
+ "step": 1965
2769
+ },
2770
+ {
2771
+ "epoch": 0.8630769020139374,
2772
+ "grad_norm": 49.5625,
2773
+ "learning_rate": 3.039454456892353e-06,
2774
+ "loss": 22.8549,
2775
+ "step": 1970
2776
+ },
2777
+ {
2778
+ "epoch": 0.8652674525266632,
2779
+ "grad_norm": 134.625,
2780
+ "learning_rate": 2.9907452508524117e-06,
2781
+ "loss": 23.9329,
2782
+ "step": 1975
2783
+ },
2784
+ {
2785
+ "epoch": 0.8674580030393888,
2786
+ "grad_norm": 43.875,
2787
+ "learning_rate": 2.9420360448124696e-06,
2788
+ "loss": 22.7826,
2789
+ "step": 1980
2790
+ },
2791
+ {
2792
+ "epoch": 0.8696485535521146,
2793
+ "grad_norm": 31.65625,
2794
+ "learning_rate": 2.8933268387725284e-06,
2795
+ "loss": 23.0787,
2796
+ "step": 1985
2797
+ },
2798
+ {
2799
+ "epoch": 0.8718391040648403,
2800
+ "grad_norm": 56.9375,
2801
+ "learning_rate": 2.8446176327325868e-06,
2802
+ "loss": 23.0323,
2803
+ "step": 1990
2804
+ },
2805
+ {
2806
+ "epoch": 0.874029654577566,
2807
+ "grad_norm": 39.78125,
2808
+ "learning_rate": 2.795908426692645e-06,
2809
+ "loss": 23.3068,
2810
+ "step": 1995
2811
+ },
2812
+ {
2813
+ "epoch": 0.8762202050902917,
2814
+ "grad_norm": 48.09375,
2815
+ "learning_rate": 2.7471992206527035e-06,
2816
+ "loss": 23.4063,
2817
+ "step": 2000
2818
+ },
2819
+ {
2820
+ "epoch": 0.8762202050902917,
2821
+ "eval_loss": NaN,
2822
+ "eval_runtime": 243.6077,
2823
+ "eval_samples_per_second": 1009.956,
2824
+ "eval_steps_per_second": 31.563,
2825
+ "step": 2000
2826
  }
2827
  ],
2828
  "logging_steps": 5,
 
2842
  "attributes": {}
2843
  }
2844
  },
2845
+ "total_flos": 1.732943025316823e+19,
2846
  "train_batch_size": 8,
2847
  "trial_name": null,
2848
  "trial_params": null