error577 commited on
Commit
33aa6cc
·
verified ·
1 Parent(s): 3e1353a

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70b40b4ea852faa0dabd88f7ddbc8095331dcaf982cf3b34cc7272211b022508
3
  size 859942080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b915467464f8b803b33ca073fb0597dc5f8e94a3a3d27e6062c9ed0c6919b583
3
  size 859942080
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6973271476cd76cc32089427e31b042ab3cf370ff220d960bc069ac0e8b7e1d7
3
  size 90187222
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d109f981c5c1df0d534dcc8ba8b77ece29c425af0989248f8edec0257953d1e
3
  size 90187222
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:092d05c850cba14a4e1067d9540d36872c05fc71b3eeadb4562ba802384222c9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d25e1211c9046b57a33acc5132889e25842980cf549f2f259dad9359efcc4211
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e75c704c71c99deef040fe407c6f53cc8d33f4439273d19c1681b1ebdfb69672
3
  size 2080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f23c15d28ca8e830a4a7dce196129a0e658718ff1073df92eaf612f2c3f31ab6
3
  size 2080
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 1.802311658859253,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-100",
4
- "epoch": 0.0014563354231746657,
5
  "eval_steps": 100,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1431,6 +1431,714 @@
1431
  "eval_samples_per_second": 5.702,
1432
  "eval_steps_per_second": 1.901,
1433
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1434
  }
1435
  ],
1436
  "logging_steps": 1,
@@ -1445,7 +2153,7 @@
1445
  "early_stopping_threshold": 0.0
1446
  },
1447
  "attributes": {
1448
- "early_stopping_patience_counter": 1
1449
  }
1450
  },
1451
  "TrainerControl": {
@@ -1459,7 +2167,7 @@
1459
  "attributes": {}
1460
  }
1461
  },
1462
- "total_flos": 1.32610101608448e+16,
1463
  "train_batch_size": 3,
1464
  "trial_name": null,
1465
  "trial_params": null
 
1
  {
2
  "best_metric": 1.802311658859253,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-100",
4
+ "epoch": 0.0021845031347619984,
5
  "eval_steps": 100,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1431
  "eval_samples_per_second": 5.702,
1432
  "eval_steps_per_second": 1.901,
1433
  "step": 200
1434
+ },
1435
+ {
1436
+ "epoch": 0.0014636171002905389,
1437
+ "grad_norm": 16.332197189331055,
1438
+ "learning_rate": 0.00019999990763608366,
1439
+ "loss": 1.9815,
1440
+ "step": 201
1441
+ },
1442
+ {
1443
+ "epoch": 0.0014708987774064122,
1444
+ "grad_norm": 14.076228141784668,
1445
+ "learning_rate": 0.00019999990763608366,
1446
+ "loss": 2.2106,
1447
+ "step": 202
1448
+ },
1449
+ {
1450
+ "epoch": 0.0014781804545222856,
1451
+ "grad_norm": 13.295379638671875,
1452
+ "learning_rate": 0.00019999990763608366,
1453
+ "loss": 1.9168,
1454
+ "step": 203
1455
+ },
1456
+ {
1457
+ "epoch": 0.0014854621316381588,
1458
+ "grad_norm": 12.049559593200684,
1459
+ "learning_rate": 0.00019999990763608366,
1460
+ "loss": 1.9751,
1461
+ "step": 204
1462
+ },
1463
+ {
1464
+ "epoch": 0.0014927438087540322,
1465
+ "grad_norm": 9.581024169921875,
1466
+ "learning_rate": 0.00019999990763608366,
1467
+ "loss": 1.6029,
1468
+ "step": 205
1469
+ },
1470
+ {
1471
+ "epoch": 0.0015000254858699056,
1472
+ "grad_norm": 21.94390296936035,
1473
+ "learning_rate": 0.00019999989308416843,
1474
+ "loss": 2.8107,
1475
+ "step": 206
1476
+ },
1477
+ {
1478
+ "epoch": 0.0015073071629857788,
1479
+ "grad_norm": 13.23304271697998,
1480
+ "learning_rate": 0.00019999989308416843,
1481
+ "loss": 2.3767,
1482
+ "step": 207
1483
+ },
1484
+ {
1485
+ "epoch": 0.0015145888401016522,
1486
+ "grad_norm": 8.7070951461792,
1487
+ "learning_rate": 0.00019999989308416843,
1488
+ "loss": 2.2435,
1489
+ "step": 208
1490
+ },
1491
+ {
1492
+ "epoch": 0.0015218705172175256,
1493
+ "grad_norm": 14.414718627929688,
1494
+ "learning_rate": 0.00019999989308416843,
1495
+ "loss": 2.2377,
1496
+ "step": 209
1497
+ },
1498
+ {
1499
+ "epoch": 0.001529152194333399,
1500
+ "grad_norm": 7.783048629760742,
1501
+ "learning_rate": 0.00019999989308416843,
1502
+ "loss": 1.6258,
1503
+ "step": 210
1504
+ },
1505
+ {
1506
+ "epoch": 0.0015364338714492721,
1507
+ "grad_norm": 7.3645429611206055,
1508
+ "learning_rate": 0.00019999989308416843,
1509
+ "loss": 2.0517,
1510
+ "step": 211
1511
+ },
1512
+ {
1513
+ "epoch": 0.0015437155485651455,
1514
+ "grad_norm": 9.363513946533203,
1515
+ "learning_rate": 0.00019999989308416843,
1516
+ "loss": 2.0216,
1517
+ "step": 212
1518
+ },
1519
+ {
1520
+ "epoch": 0.001550997225681019,
1521
+ "grad_norm": 12.244095802307129,
1522
+ "learning_rate": 0.00019999989308416843,
1523
+ "loss": 1.9019,
1524
+ "step": 213
1525
+ },
1526
+ {
1527
+ "epoch": 0.0015582789027968921,
1528
+ "grad_norm": 13.650781631469727,
1529
+ "learning_rate": 0.00019999989308416843,
1530
+ "loss": 1.7148,
1531
+ "step": 214
1532
+ },
1533
+ {
1534
+ "epoch": 0.0015655605799127655,
1535
+ "grad_norm": 10.877537727355957,
1536
+ "learning_rate": 0.00019999989308416843,
1537
+ "loss": 1.5228,
1538
+ "step": 215
1539
+ },
1540
+ {
1541
+ "epoch": 0.001572842257028639,
1542
+ "grad_norm": 11.581880569458008,
1543
+ "learning_rate": 0.00019999989308416843,
1544
+ "loss": 2.0176,
1545
+ "step": 216
1546
+ },
1547
+ {
1548
+ "epoch": 0.001580123934144512,
1549
+ "grad_norm": 10.073941230773926,
1550
+ "learning_rate": 0.00019999989308416843,
1551
+ "loss": 1.5565,
1552
+ "step": 217
1553
+ },
1554
+ {
1555
+ "epoch": 0.0015874056112603855,
1556
+ "grad_norm": 8.537222862243652,
1557
+ "learning_rate": 0.00019999989308416843,
1558
+ "loss": 1.329,
1559
+ "step": 218
1560
+ },
1561
+ {
1562
+ "epoch": 0.0015946872883762589,
1563
+ "grad_norm": 14.164959907531738,
1564
+ "learning_rate": 0.00019999989308416843,
1565
+ "loss": 1.663,
1566
+ "step": 219
1567
+ },
1568
+ {
1569
+ "epoch": 0.001601968965492132,
1570
+ "grad_norm": 10.332074165344238,
1571
+ "learning_rate": 0.00019999989308416843,
1572
+ "loss": 1.6474,
1573
+ "step": 220
1574
+ },
1575
+ {
1576
+ "epoch": 0.0016092506426080054,
1577
+ "grad_norm": 18.904674530029297,
1578
+ "learning_rate": 0.00019999989308416843,
1579
+ "loss": 3.0191,
1580
+ "step": 221
1581
+ },
1582
+ {
1583
+ "epoch": 0.0016165323197238788,
1584
+ "grad_norm": 23.476869583129883,
1585
+ "learning_rate": 0.00019999989308416843,
1586
+ "loss": 2.5204,
1587
+ "step": 222
1588
+ },
1589
+ {
1590
+ "epoch": 0.0016238139968397522,
1591
+ "grad_norm": 10.671902656555176,
1592
+ "learning_rate": 0.00019999989308416843,
1593
+ "loss": 2.2773,
1594
+ "step": 223
1595
+ },
1596
+ {
1597
+ "epoch": 0.0016310956739556254,
1598
+ "grad_norm": 11.046639442443848,
1599
+ "learning_rate": 0.00019999989308416843,
1600
+ "loss": 1.6691,
1601
+ "step": 224
1602
+ },
1603
+ {
1604
+ "epoch": 0.0016383773510714988,
1605
+ "grad_norm": 15.235762596130371,
1606
+ "learning_rate": 0.00019999989308416843,
1607
+ "loss": 1.9877,
1608
+ "step": 225
1609
+ },
1610
+ {
1611
+ "epoch": 0.0016456590281873722,
1612
+ "grad_norm": 11.436957359313965,
1613
+ "learning_rate": 0.00019999989308416843,
1614
+ "loss": 1.2688,
1615
+ "step": 226
1616
+ },
1617
+ {
1618
+ "epoch": 0.0016529407053032454,
1619
+ "grad_norm": 11.368927001953125,
1620
+ "learning_rate": 0.00019999989308416843,
1621
+ "loss": 1.7685,
1622
+ "step": 227
1623
+ },
1624
+ {
1625
+ "epoch": 0.0016602223824191188,
1626
+ "grad_norm": 8.242121696472168,
1627
+ "learning_rate": 0.0001999998785322532,
1628
+ "loss": 1.6697,
1629
+ "step": 228
1630
+ },
1631
+ {
1632
+ "epoch": 0.0016675040595349922,
1633
+ "grad_norm": 5.462067127227783,
1634
+ "learning_rate": 0.0001999998785322532,
1635
+ "loss": 1.1327,
1636
+ "step": 229
1637
+ },
1638
+ {
1639
+ "epoch": 0.0016747857366508653,
1640
+ "grad_norm": 10.926278114318848,
1641
+ "learning_rate": 0.0001999998785322532,
1642
+ "loss": 1.9485,
1643
+ "step": 230
1644
+ },
1645
+ {
1646
+ "epoch": 0.0016820674137667387,
1647
+ "grad_norm": 7.242750644683838,
1648
+ "learning_rate": 0.0001999998785322532,
1649
+ "loss": 1.4423,
1650
+ "step": 231
1651
+ },
1652
+ {
1653
+ "epoch": 0.0016893490908826121,
1654
+ "grad_norm": 11.249567985534668,
1655
+ "learning_rate": 0.0001999998785322532,
1656
+ "loss": 2.0181,
1657
+ "step": 232
1658
+ },
1659
+ {
1660
+ "epoch": 0.0016966307679984853,
1661
+ "grad_norm": 11.364212036132812,
1662
+ "learning_rate": 0.0001999998785322532,
1663
+ "loss": 2.0749,
1664
+ "step": 233
1665
+ },
1666
+ {
1667
+ "epoch": 0.0017039124451143587,
1668
+ "grad_norm": 10.349282264709473,
1669
+ "learning_rate": 0.0001999998785322532,
1670
+ "loss": 1.6206,
1671
+ "step": 234
1672
+ },
1673
+ {
1674
+ "epoch": 0.001711194122230232,
1675
+ "grad_norm": 12.662111282348633,
1676
+ "learning_rate": 0.0001999998785322532,
1677
+ "loss": 2.0099,
1678
+ "step": 235
1679
+ },
1680
+ {
1681
+ "epoch": 0.0017184757993461055,
1682
+ "grad_norm": 7.831268787384033,
1683
+ "learning_rate": 0.0001999998785322532,
1684
+ "loss": 1.551,
1685
+ "step": 236
1686
+ },
1687
+ {
1688
+ "epoch": 0.0017257574764619787,
1689
+ "grad_norm": 11.112309455871582,
1690
+ "learning_rate": 0.0001999998785322532,
1691
+ "loss": 2.1506,
1692
+ "step": 237
1693
+ },
1694
+ {
1695
+ "epoch": 0.001733039153577852,
1696
+ "grad_norm": 11.007096290588379,
1697
+ "learning_rate": 0.00019999986398033798,
1698
+ "loss": 1.8578,
1699
+ "step": 238
1700
+ },
1701
+ {
1702
+ "epoch": 0.0017403208306937255,
1703
+ "grad_norm": 8.862707138061523,
1704
+ "learning_rate": 0.00019999986398033798,
1705
+ "loss": 1.2392,
1706
+ "step": 239
1707
+ },
1708
+ {
1709
+ "epoch": 0.0017476025078095986,
1710
+ "grad_norm": 6.822959899902344,
1711
+ "learning_rate": 0.00019999986398033798,
1712
+ "loss": 1.26,
1713
+ "step": 240
1714
+ },
1715
+ {
1716
+ "epoch": 0.001754884184925472,
1717
+ "grad_norm": 9.791293144226074,
1718
+ "learning_rate": 0.00019999986398033798,
1719
+ "loss": 1.5111,
1720
+ "step": 241
1721
+ },
1722
+ {
1723
+ "epoch": 0.0017621658620413454,
1724
+ "grad_norm": 14.250927925109863,
1725
+ "learning_rate": 0.00019999986398033798,
1726
+ "loss": 2.5111,
1727
+ "step": 242
1728
+ },
1729
+ {
1730
+ "epoch": 0.0017694475391572186,
1731
+ "grad_norm": 18.571813583374023,
1732
+ "learning_rate": 0.00019999986398033798,
1733
+ "loss": 2.0137,
1734
+ "step": 243
1735
+ },
1736
+ {
1737
+ "epoch": 0.001776729216273092,
1738
+ "grad_norm": 14.961633682250977,
1739
+ "learning_rate": 0.00019999986398033798,
1740
+ "loss": 2.6579,
1741
+ "step": 244
1742
+ },
1743
+ {
1744
+ "epoch": 0.0017840108933889654,
1745
+ "grad_norm": 12.702574729919434,
1746
+ "learning_rate": 0.00019999986398033798,
1747
+ "loss": 2.5643,
1748
+ "step": 245
1749
+ },
1750
+ {
1751
+ "epoch": 0.0017912925705048386,
1752
+ "grad_norm": 11.677947044372559,
1753
+ "learning_rate": 0.00019999986398033798,
1754
+ "loss": 2.6246,
1755
+ "step": 246
1756
+ },
1757
+ {
1758
+ "epoch": 0.001798574247620712,
1759
+ "grad_norm": 13.013418197631836,
1760
+ "learning_rate": 0.00019999986398033798,
1761
+ "loss": 2.0649,
1762
+ "step": 247
1763
+ },
1764
+ {
1765
+ "epoch": 0.0018058559247365854,
1766
+ "grad_norm": 13.50846004486084,
1767
+ "learning_rate": 0.00019999984942842275,
1768
+ "loss": 2.4352,
1769
+ "step": 248
1770
+ },
1771
+ {
1772
+ "epoch": 0.0018131376018524588,
1773
+ "grad_norm": 9.18942928314209,
1774
+ "learning_rate": 0.00019999984942842275,
1775
+ "loss": 2.0352,
1776
+ "step": 249
1777
+ },
1778
+ {
1779
+ "epoch": 0.001820419278968332,
1780
+ "grad_norm": 8.032073020935059,
1781
+ "learning_rate": 0.00019999984942842275,
1782
+ "loss": 1.8027,
1783
+ "step": 250
1784
+ },
1785
+ {
1786
+ "epoch": 0.0018277009560842053,
1787
+ "grad_norm": 10.291788101196289,
1788
+ "learning_rate": 0.00019999984942842275,
1789
+ "loss": 1.541,
1790
+ "step": 251
1791
+ },
1792
+ {
1793
+ "epoch": 0.0018349826332000787,
1794
+ "grad_norm": 9.52447509765625,
1795
+ "learning_rate": 0.00019999984942842275,
1796
+ "loss": 1.5761,
1797
+ "step": 252
1798
+ },
1799
+ {
1800
+ "epoch": 0.001842264310315952,
1801
+ "grad_norm": 11.109513282775879,
1802
+ "learning_rate": 0.00019999984942842275,
1803
+ "loss": 1.9252,
1804
+ "step": 253
1805
+ },
1806
+ {
1807
+ "epoch": 0.0018495459874318253,
1808
+ "grad_norm": 9.843435287475586,
1809
+ "learning_rate": 0.00019999984942842275,
1810
+ "loss": 1.5539,
1811
+ "step": 254
1812
+ },
1813
+ {
1814
+ "epoch": 0.0018568276645476987,
1815
+ "grad_norm": 10.07934284210205,
1816
+ "learning_rate": 0.00019999984942842275,
1817
+ "loss": 1.6413,
1818
+ "step": 255
1819
+ },
1820
+ {
1821
+ "epoch": 0.0018641093416635719,
1822
+ "grad_norm": 11.096989631652832,
1823
+ "learning_rate": 0.00019999984942842275,
1824
+ "loss": 1.8902,
1825
+ "step": 256
1826
+ },
1827
+ {
1828
+ "epoch": 0.0018713910187794453,
1829
+ "grad_norm": 9.845468521118164,
1830
+ "learning_rate": 0.00019999983487650752,
1831
+ "loss": 1.6295,
1832
+ "step": 257
1833
+ },
1834
+ {
1835
+ "epoch": 0.0018786726958953187,
1836
+ "grad_norm": 11.693940162658691,
1837
+ "learning_rate": 0.00019999983487650752,
1838
+ "loss": 1.6928,
1839
+ "step": 258
1840
+ },
1841
+ {
1842
+ "epoch": 0.001885954373011192,
1843
+ "grad_norm": 11.13638687133789,
1844
+ "learning_rate": 0.00019999983487650752,
1845
+ "loss": 1.7201,
1846
+ "step": 259
1847
+ },
1848
+ {
1849
+ "epoch": 0.0018932360501270652,
1850
+ "grad_norm": 29.184389114379883,
1851
+ "learning_rate": 0.00019999983487650752,
1852
+ "loss": 1.0939,
1853
+ "step": 260
1854
+ },
1855
+ {
1856
+ "epoch": 0.0019005177272429386,
1857
+ "grad_norm": 13.516611099243164,
1858
+ "learning_rate": 0.00019999983487650752,
1859
+ "loss": 1.3989,
1860
+ "step": 261
1861
+ },
1862
+ {
1863
+ "epoch": 0.001907799404358812,
1864
+ "grad_norm": 15.792509078979492,
1865
+ "learning_rate": 0.00019999983487650752,
1866
+ "loss": 2.2967,
1867
+ "step": 262
1868
+ },
1869
+ {
1870
+ "epoch": 0.0019150810814746852,
1871
+ "grad_norm": 10.118632316589355,
1872
+ "learning_rate": 0.00019999983487650752,
1873
+ "loss": 1.816,
1874
+ "step": 263
1875
+ },
1876
+ {
1877
+ "epoch": 0.0019223627585905586,
1878
+ "grad_norm": 10.63323974609375,
1879
+ "learning_rate": 0.00019999983487650752,
1880
+ "loss": 1.622,
1881
+ "step": 264
1882
+ },
1883
+ {
1884
+ "epoch": 0.001929644435706432,
1885
+ "grad_norm": 14.89815616607666,
1886
+ "learning_rate": 0.00019999983487650752,
1887
+ "loss": 2.2951,
1888
+ "step": 265
1889
+ },
1890
+ {
1891
+ "epoch": 0.0019369261128223052,
1892
+ "grad_norm": 18.71169662475586,
1893
+ "learning_rate": 0.00019999983487650752,
1894
+ "loss": 2.7834,
1895
+ "step": 266
1896
+ },
1897
+ {
1898
+ "epoch": 0.0019442077899381786,
1899
+ "grad_norm": 15.97852897644043,
1900
+ "learning_rate": 0.00019999983487650752,
1901
+ "loss": 1.9917,
1902
+ "step": 267
1903
+ },
1904
+ {
1905
+ "epoch": 0.001951489467054052,
1906
+ "grad_norm": 16.946123123168945,
1907
+ "learning_rate": 0.00019999983487650752,
1908
+ "loss": 2.1462,
1909
+ "step": 268
1910
+ },
1911
+ {
1912
+ "epoch": 0.0019587711441699253,
1913
+ "grad_norm": 13.42167854309082,
1914
+ "learning_rate": 0.00019999983487650752,
1915
+ "loss": 1.6697,
1916
+ "step": 269
1917
+ },
1918
+ {
1919
+ "epoch": 0.0019660528212857987,
1920
+ "grad_norm": 18.283449172973633,
1921
+ "learning_rate": 0.00019999983487650752,
1922
+ "loss": 2.2946,
1923
+ "step": 270
1924
+ },
1925
+ {
1926
+ "epoch": 0.0019733344984016717,
1927
+ "grad_norm": 13.643292427062988,
1928
+ "learning_rate": 0.00019999983487650752,
1929
+ "loss": 1.3446,
1930
+ "step": 271
1931
+ },
1932
+ {
1933
+ "epoch": 0.001980616175517545,
1934
+ "grad_norm": 24.068721771240234,
1935
+ "learning_rate": 0.00019999983487650752,
1936
+ "loss": 2.3003,
1937
+ "step": 272
1938
+ },
1939
+ {
1940
+ "epoch": 0.0019878978526334185,
1941
+ "grad_norm": 8.943958282470703,
1942
+ "learning_rate": 0.00019999983487650752,
1943
+ "loss": 1.4237,
1944
+ "step": 273
1945
+ },
1946
+ {
1947
+ "epoch": 0.001995179529749292,
1948
+ "grad_norm": 10.839051246643066,
1949
+ "learning_rate": 0.0001999998203245923,
1950
+ "loss": 2.3502,
1951
+ "step": 274
1952
+ },
1953
+ {
1954
+ "epoch": 0.0020024612068651653,
1955
+ "grad_norm": 13.242242813110352,
1956
+ "learning_rate": 0.0001999998203245923,
1957
+ "loss": 2.7399,
1958
+ "step": 275
1959
+ },
1960
+ {
1961
+ "epoch": 0.0020097428839810387,
1962
+ "grad_norm": 12.316596984863281,
1963
+ "learning_rate": 0.0001999998203245923,
1964
+ "loss": 1.4512,
1965
+ "step": 276
1966
+ },
1967
+ {
1968
+ "epoch": 0.0020170245610969116,
1969
+ "grad_norm": 12.517656326293945,
1970
+ "learning_rate": 0.0001999998203245923,
1971
+ "loss": 2.4683,
1972
+ "step": 277
1973
+ },
1974
+ {
1975
+ "epoch": 0.002024306238212785,
1976
+ "grad_norm": 9.340202331542969,
1977
+ "learning_rate": 0.0001999998203245923,
1978
+ "loss": 1.818,
1979
+ "step": 278
1980
+ },
1981
+ {
1982
+ "epoch": 0.0020315879153286584,
1983
+ "grad_norm": 9.712027549743652,
1984
+ "learning_rate": 0.0001999998203245923,
1985
+ "loss": 1.6762,
1986
+ "step": 279
1987
+ },
1988
+ {
1989
+ "epoch": 0.002038869592444532,
1990
+ "grad_norm": 7.545422077178955,
1991
+ "learning_rate": 0.0001999998203245923,
1992
+ "loss": 1.223,
1993
+ "step": 280
1994
+ },
1995
+ {
1996
+ "epoch": 0.002046151269560405,
1997
+ "grad_norm": 7.926312446594238,
1998
+ "learning_rate": 0.0001999998203245923,
1999
+ "loss": 1.4197,
2000
+ "step": 281
2001
+ },
2002
+ {
2003
+ "epoch": 0.0020534329466762786,
2004
+ "grad_norm": 10.182934761047363,
2005
+ "learning_rate": 0.0001999998203245923,
2006
+ "loss": 1.8908,
2007
+ "step": 282
2008
+ },
2009
+ {
2010
+ "epoch": 0.002060714623792152,
2011
+ "grad_norm": 13.26349925994873,
2012
+ "learning_rate": 0.00019999980577267706,
2013
+ "loss": 1.686,
2014
+ "step": 283
2015
+ },
2016
+ {
2017
+ "epoch": 0.002067996300908025,
2018
+ "grad_norm": 9.935827255249023,
2019
+ "learning_rate": 0.00019999980577267706,
2020
+ "loss": 1.2595,
2021
+ "step": 284
2022
+ },
2023
+ {
2024
+ "epoch": 0.0020752779780238984,
2025
+ "grad_norm": 10.127717018127441,
2026
+ "learning_rate": 0.00019999980577267706,
2027
+ "loss": 1.0804,
2028
+ "step": 285
2029
+ },
2030
+ {
2031
+ "epoch": 0.0020825596551397717,
2032
+ "grad_norm": 9.184852600097656,
2033
+ "learning_rate": 0.00019999980577267706,
2034
+ "loss": 1.1123,
2035
+ "step": 286
2036
+ },
2037
+ {
2038
+ "epoch": 0.002089841332255645,
2039
+ "grad_norm": 13.199743270874023,
2040
+ "learning_rate": 0.00019999980577267706,
2041
+ "loss": 2.1116,
2042
+ "step": 287
2043
+ },
2044
+ {
2045
+ "epoch": 0.0020971230093715185,
2046
+ "grad_norm": 9.43826961517334,
2047
+ "learning_rate": 0.00019999980577267706,
2048
+ "loss": 2.0209,
2049
+ "step": 288
2050
+ },
2051
+ {
2052
+ "epoch": 0.002104404686487392,
2053
+ "grad_norm": 10.829771041870117,
2054
+ "learning_rate": 0.00019999980577267706,
2055
+ "loss": 1.9645,
2056
+ "step": 289
2057
+ },
2058
+ {
2059
+ "epoch": 0.002111686363603265,
2060
+ "grad_norm": 9.616719245910645,
2061
+ "learning_rate": 0.00019999980577267706,
2062
+ "loss": 1.8015,
2063
+ "step": 290
2064
+ },
2065
+ {
2066
+ "epoch": 0.0021189680407191383,
2067
+ "grad_norm": 6.407859802246094,
2068
+ "learning_rate": 0.00019999979122076184,
2069
+ "loss": 0.7276,
2070
+ "step": 291
2071
+ },
2072
+ {
2073
+ "epoch": 0.0021262497178350117,
2074
+ "grad_norm": 14.593647956848145,
2075
+ "learning_rate": 0.00019999979122076184,
2076
+ "loss": 2.1155,
2077
+ "step": 292
2078
+ },
2079
+ {
2080
+ "epoch": 0.002133531394950885,
2081
+ "grad_norm": 12.054818153381348,
2082
+ "learning_rate": 0.00019999979122076184,
2083
+ "loss": 2.4701,
2084
+ "step": 293
2085
+ },
2086
+ {
2087
+ "epoch": 0.0021408130720667585,
2088
+ "grad_norm": 11.654630661010742,
2089
+ "learning_rate": 0.00019999979122076184,
2090
+ "loss": 2.0063,
2091
+ "step": 294
2092
+ },
2093
+ {
2094
+ "epoch": 0.002148094749182632,
2095
+ "grad_norm": 12.950918197631836,
2096
+ "learning_rate": 0.00019999979122076184,
2097
+ "loss": 1.6331,
2098
+ "step": 295
2099
+ },
2100
+ {
2101
+ "epoch": 0.0021553764262985053,
2102
+ "grad_norm": 12.525979995727539,
2103
+ "learning_rate": 0.00019999979122076184,
2104
+ "loss": 2.1505,
2105
+ "step": 296
2106
+ },
2107
+ {
2108
+ "epoch": 0.0021626581034143782,
2109
+ "grad_norm": 12.865964889526367,
2110
+ "learning_rate": 0.00019999979122076184,
2111
+ "loss": 2.4868,
2112
+ "step": 297
2113
+ },
2114
+ {
2115
+ "epoch": 0.0021699397805302516,
2116
+ "grad_norm": 12.397571563720703,
2117
+ "learning_rate": 0.0001999997766688466,
2118
+ "loss": 1.4126,
2119
+ "step": 298
2120
+ },
2121
+ {
2122
+ "epoch": 0.002177221457646125,
2123
+ "grad_norm": 8.017643928527832,
2124
+ "learning_rate": 0.0001999997766688466,
2125
+ "loss": 1.5486,
2126
+ "step": 299
2127
+ },
2128
+ {
2129
+ "epoch": 0.0021845031347619984,
2130
+ "grad_norm": 11.982081413269043,
2131
+ "learning_rate": 0.0001999997766688466,
2132
+ "loss": 1.3418,
2133
+ "step": 300
2134
+ },
2135
+ {
2136
+ "epoch": 0.0021845031347619984,
2137
+ "eval_loss": 1.8082797527313232,
2138
+ "eval_runtime": 36.2476,
2139
+ "eval_samples_per_second": 5.711,
2140
+ "eval_steps_per_second": 1.904,
2141
+ "step": 300
2142
  }
2143
  ],
2144
  "logging_steps": 1,
 
2153
  "early_stopping_threshold": 0.0
2154
  },
2155
  "attributes": {
2156
+ "early_stopping_patience_counter": 2
2157
  }
2158
  },
2159
  "TrainerControl": {
 
2167
  "attributes": {}
2168
  }
2169
  },
2170
+ "total_flos": 1.98915152412672e+16,
2171
  "train_batch_size": 3,
2172
  "trial_name": null,
2173
  "trial_params": null