minpeter commited on
Commit
c671bc1
·
verified ·
1 Parent(s): 5d4c972

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d11d0d50a5f319c00547314089e917e81cf7d4025ad36d223d9654b7fd00af18
3
  size 40001880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8af9875dba3abc72b08d5cc40e93518bd82b3e85f63b6cad6ce774111d3dff6
3
  size 40001880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:daad3398daf503ff56b15ae48fea43ae113a26da4f3b05c6bdf650fa55832975
3
  size 40043787
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c652b40dc1376b5f1c6481c56235751d13ff2f60b4ee715e4f48755095005c2
3
  size 40043787
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f122671057b196d87919746ab31fde7751674c4ff91d4edbe8291727f123568
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:793829d79b248c3a7b8954f2cd95073c2ba034f6ee2bb0edff8ce8fef88cb5ad
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1974eed1e05a84c5cdcaa5ca9e0c0e7cecb70b73f4e5db163c1db3a32122b7e
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:954462bb708bcdba3c0732881adee53bb51d512303049568372c51a54b8cb129
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.14144271570014144,
6
  "eval_steps": 100,
7
- "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1424,6 +1424,714 @@
1424
  "eval_samples_per_second": 149.98,
1425
  "eval_steps_per_second": 18.953,
1426
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1427
  }
1428
  ],
1429
  "logging_steps": 1,
@@ -1443,7 +2151,7 @@
1443
  "attributes": {}
1444
  }
1445
  },
1446
- "total_flos": 1.4851460038656e+16,
1447
  "train_batch_size": 128,
1448
  "trial_name": null,
1449
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.21216407355021216,
6
  "eval_steps": 100,
7
+ "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1424
  "eval_samples_per_second": 149.98,
1425
  "eval_steps_per_second": 18.953,
1426
  "step": 200
1427
+ },
1428
+ {
1429
+ "epoch": 0.14214992927864215,
1430
+ "grad_norm": 0.76953125,
1431
+ "learning_rate": 0.0008796665258991866,
1432
+ "loss": 10.2804,
1433
+ "step": 201
1434
+ },
1435
+ {
1436
+ "epoch": 0.14285714285714285,
1437
+ "grad_norm": 0.83203125,
1438
+ "learning_rate": 0.0008793524987925326,
1439
+ "loss": 10.2891,
1440
+ "step": 202
1441
+ },
1442
+ {
1443
+ "epoch": 0.14356435643564355,
1444
+ "grad_norm": 0.7890625,
1445
+ "learning_rate": 0.0008790361222612515,
1446
+ "loss": 10.2691,
1447
+ "step": 203
1448
+ },
1449
+ {
1450
+ "epoch": 0.14427157001414428,
1451
+ "grad_norm": 0.7890625,
1452
+ "learning_rate": 0.0008787173980365612,
1453
+ "loss": 10.2514,
1454
+ "step": 204
1455
+ },
1456
+ {
1457
+ "epoch": 0.144978783592645,
1458
+ "grad_norm": 0.796875,
1459
+ "learning_rate": 0.0008783963278625267,
1460
+ "loss": 10.264,
1461
+ "step": 205
1462
+ },
1463
+ {
1464
+ "epoch": 0.1456859971711457,
1465
+ "grad_norm": 0.7890625,
1466
+ "learning_rate": 0.0008780729134960495,
1467
+ "loss": 10.2424,
1468
+ "step": 206
1469
+ },
1470
+ {
1471
+ "epoch": 0.1463932107496464,
1472
+ "grad_norm": 0.74609375,
1473
+ "learning_rate": 0.000877747156706859,
1474
+ "loss": 10.2754,
1475
+ "step": 207
1476
+ },
1477
+ {
1478
+ "epoch": 0.1471004243281471,
1479
+ "grad_norm": 0.76171875,
1480
+ "learning_rate": 0.0008774190592775022,
1481
+ "loss": 10.2663,
1482
+ "step": 208
1483
+ },
1484
+ {
1485
+ "epoch": 0.1478076379066478,
1486
+ "grad_norm": 0.7421875,
1487
+ "learning_rate": 0.0008770886230033342,
1488
+ "loss": 10.255,
1489
+ "step": 209
1490
+ },
1491
+ {
1492
+ "epoch": 0.1485148514851485,
1493
+ "grad_norm": 0.78125,
1494
+ "learning_rate": 0.0008767558496925081,
1495
+ "loss": 10.2529,
1496
+ "step": 210
1497
+ },
1498
+ {
1499
+ "epoch": 0.1492220650636492,
1500
+ "grad_norm": 0.76953125,
1501
+ "learning_rate": 0.0008764207411659658,
1502
+ "loss": 10.2576,
1503
+ "step": 211
1504
+ },
1505
+ {
1506
+ "epoch": 0.14992927864214992,
1507
+ "grad_norm": 0.75390625,
1508
+ "learning_rate": 0.0008760832992574269,
1509
+ "loss": 10.243,
1510
+ "step": 212
1511
+ },
1512
+ {
1513
+ "epoch": 0.15063649222065065,
1514
+ "grad_norm": 0.7578125,
1515
+ "learning_rate": 0.0008757435258133798,
1516
+ "loss": 10.2374,
1517
+ "step": 213
1518
+ },
1519
+ {
1520
+ "epoch": 0.15134370579915135,
1521
+ "grad_norm": 0.7890625,
1522
+ "learning_rate": 0.0008754014226930707,
1523
+ "loss": 10.248,
1524
+ "step": 214
1525
+ },
1526
+ {
1527
+ "epoch": 0.15205091937765206,
1528
+ "grad_norm": 0.77734375,
1529
+ "learning_rate": 0.0008750569917684944,
1530
+ "loss": 10.2509,
1531
+ "step": 215
1532
+ },
1533
+ {
1534
+ "epoch": 0.15275813295615276,
1535
+ "grad_norm": 0.78125,
1536
+ "learning_rate": 0.0008747102349243827,
1537
+ "loss": 10.2583,
1538
+ "step": 216
1539
+ },
1540
+ {
1541
+ "epoch": 0.15346534653465346,
1542
+ "grad_norm": 0.78125,
1543
+ "learning_rate": 0.0008743611540581957,
1544
+ "loss": 10.2646,
1545
+ "step": 217
1546
+ },
1547
+ {
1548
+ "epoch": 0.15417256011315417,
1549
+ "grad_norm": 0.80859375,
1550
+ "learning_rate": 0.00087400975108011,
1551
+ "loss": 10.2393,
1552
+ "step": 218
1553
+ },
1554
+ {
1555
+ "epoch": 0.15487977369165487,
1556
+ "grad_norm": 0.796875,
1557
+ "learning_rate": 0.0008736560279130091,
1558
+ "loss": 10.2518,
1559
+ "step": 219
1560
+ },
1561
+ {
1562
+ "epoch": 0.15558698727015557,
1563
+ "grad_norm": 0.7265625,
1564
+ "learning_rate": 0.0008732999864924726,
1565
+ "loss": 10.2396,
1566
+ "step": 220
1567
+ },
1568
+ {
1569
+ "epoch": 0.1562942008486563,
1570
+ "grad_norm": 0.7578125,
1571
+ "learning_rate": 0.0008729416287667654,
1572
+ "loss": 10.2418,
1573
+ "step": 221
1574
+ },
1575
+ {
1576
+ "epoch": 0.157001414427157,
1577
+ "grad_norm": 0.79296875,
1578
+ "learning_rate": 0.0008725809566968277,
1579
+ "loss": 10.2397,
1580
+ "step": 222
1581
+ },
1582
+ {
1583
+ "epoch": 0.15770862800565771,
1584
+ "grad_norm": 0.8046875,
1585
+ "learning_rate": 0.0008722179722562636,
1586
+ "loss": 10.2309,
1587
+ "step": 223
1588
+ },
1589
+ {
1590
+ "epoch": 0.15841584158415842,
1591
+ "grad_norm": 0.78515625,
1592
+ "learning_rate": 0.0008718526774313301,
1593
+ "loss": 10.248,
1594
+ "step": 224
1595
+ },
1596
+ {
1597
+ "epoch": 0.15912305516265912,
1598
+ "grad_norm": 0.796875,
1599
+ "learning_rate": 0.0008714850742209275,
1600
+ "loss": 10.232,
1601
+ "step": 225
1602
+ },
1603
+ {
1604
+ "epoch": 0.15983026874115983,
1605
+ "grad_norm": 0.7890625,
1606
+ "learning_rate": 0.000871115164636587,
1607
+ "loss": 10.261,
1608
+ "step": 226
1609
+ },
1610
+ {
1611
+ "epoch": 0.16053748231966053,
1612
+ "grad_norm": 0.82421875,
1613
+ "learning_rate": 0.0008707429507024607,
1614
+ "loss": 10.2448,
1615
+ "step": 227
1616
+ },
1617
+ {
1618
+ "epoch": 0.16124469589816123,
1619
+ "grad_norm": 0.8125,
1620
+ "learning_rate": 0.0008703684344553098,
1621
+ "loss": 10.2303,
1622
+ "step": 228
1623
+ },
1624
+ {
1625
+ "epoch": 0.16195190947666196,
1626
+ "grad_norm": 0.76171875,
1627
+ "learning_rate": 0.000869991617944494,
1628
+ "loss": 10.2336,
1629
+ "step": 229
1630
+ },
1631
+ {
1632
+ "epoch": 0.16265912305516267,
1633
+ "grad_norm": 0.7890625,
1634
+ "learning_rate": 0.0008696125032319601,
1635
+ "loss": 10.2486,
1636
+ "step": 230
1637
+ },
1638
+ {
1639
+ "epoch": 0.16336633663366337,
1640
+ "grad_norm": 0.78125,
1641
+ "learning_rate": 0.0008692310923922306,
1642
+ "loss": 10.2299,
1643
+ "step": 231
1644
+ },
1645
+ {
1646
+ "epoch": 0.16407355021216408,
1647
+ "grad_norm": 0.75390625,
1648
+ "learning_rate": 0.0008688473875123925,
1649
+ "loss": 10.2432,
1650
+ "step": 232
1651
+ },
1652
+ {
1653
+ "epoch": 0.16478076379066478,
1654
+ "grad_norm": 0.77734375,
1655
+ "learning_rate": 0.000868461390692086,
1656
+ "loss": 10.2641,
1657
+ "step": 233
1658
+ },
1659
+ {
1660
+ "epoch": 0.16548797736916548,
1661
+ "grad_norm": 0.79296875,
1662
+ "learning_rate": 0.0008680731040434925,
1663
+ "loss": 10.2143,
1664
+ "step": 234
1665
+ },
1666
+ {
1667
+ "epoch": 0.1661951909476662,
1668
+ "grad_norm": 0.75390625,
1669
+ "learning_rate": 0.0008676825296913235,
1670
+ "loss": 10.2482,
1671
+ "step": 235
1672
+ },
1673
+ {
1674
+ "epoch": 0.1669024045261669,
1675
+ "grad_norm": 0.78515625,
1676
+ "learning_rate": 0.0008672896697728091,
1677
+ "loss": 10.2266,
1678
+ "step": 236
1679
+ },
1680
+ {
1681
+ "epoch": 0.1676096181046676,
1682
+ "grad_norm": 0.8046875,
1683
+ "learning_rate": 0.0008668945264376857,
1684
+ "loss": 10.2357,
1685
+ "step": 237
1686
+ },
1687
+ {
1688
+ "epoch": 0.16831683168316833,
1689
+ "grad_norm": 0.79296875,
1690
+ "learning_rate": 0.0008664971018481848,
1691
+ "loss": 10.2237,
1692
+ "step": 238
1693
+ },
1694
+ {
1695
+ "epoch": 0.16902404526166903,
1696
+ "grad_norm": 0.7890625,
1697
+ "learning_rate": 0.000866097398179021,
1698
+ "loss": 10.2447,
1699
+ "step": 239
1700
+ },
1701
+ {
1702
+ "epoch": 0.16973125884016974,
1703
+ "grad_norm": 0.77734375,
1704
+ "learning_rate": 0.00086569541761738,
1705
+ "loss": 10.2407,
1706
+ "step": 240
1707
+ },
1708
+ {
1709
+ "epoch": 0.17043847241867044,
1710
+ "grad_norm": 0.7734375,
1711
+ "learning_rate": 0.0008652911623629067,
1712
+ "loss": 10.2269,
1713
+ "step": 241
1714
+ },
1715
+ {
1716
+ "epoch": 0.17114568599717114,
1717
+ "grad_norm": 0.7890625,
1718
+ "learning_rate": 0.000864884634627693,
1719
+ "loss": 10.2356,
1720
+ "step": 242
1721
+ },
1722
+ {
1723
+ "epoch": 0.17185289957567185,
1724
+ "grad_norm": 0.78125,
1725
+ "learning_rate": 0.0008644758366362661,
1726
+ "loss": 10.2402,
1727
+ "step": 243
1728
+ },
1729
+ {
1730
+ "epoch": 0.17256011315417255,
1731
+ "grad_norm": 0.78515625,
1732
+ "learning_rate": 0.0008640647706255762,
1733
+ "loss": 10.2485,
1734
+ "step": 244
1735
+ },
1736
+ {
1737
+ "epoch": 0.17326732673267325,
1738
+ "grad_norm": 0.765625,
1739
+ "learning_rate": 0.0008636514388449835,
1740
+ "loss": 10.2711,
1741
+ "step": 245
1742
+ },
1743
+ {
1744
+ "epoch": 0.173974540311174,
1745
+ "grad_norm": 0.78515625,
1746
+ "learning_rate": 0.0008632358435562474,
1747
+ "loss": 10.2419,
1748
+ "step": 246
1749
+ },
1750
+ {
1751
+ "epoch": 0.1746817538896747,
1752
+ "grad_norm": 0.7734375,
1753
+ "learning_rate": 0.0008628179870335125,
1754
+ "loss": 10.2475,
1755
+ "step": 247
1756
+ },
1757
+ {
1758
+ "epoch": 0.1753889674681754,
1759
+ "grad_norm": 0.78515625,
1760
+ "learning_rate": 0.0008623978715632973,
1761
+ "loss": 10.2354,
1762
+ "step": 248
1763
+ },
1764
+ {
1765
+ "epoch": 0.1760961810466761,
1766
+ "grad_norm": 0.8046875,
1767
+ "learning_rate": 0.0008619754994444814,
1768
+ "loss": 10.2518,
1769
+ "step": 249
1770
+ },
1771
+ {
1772
+ "epoch": 0.1768033946251768,
1773
+ "grad_norm": 0.84765625,
1774
+ "learning_rate": 0.0008615508729882928,
1775
+ "loss": 10.2316,
1776
+ "step": 250
1777
+ },
1778
+ {
1779
+ "epoch": 0.1775106082036775,
1780
+ "grad_norm": 0.8046875,
1781
+ "learning_rate": 0.0008611239945182946,
1782
+ "loss": 10.2365,
1783
+ "step": 251
1784
+ },
1785
+ {
1786
+ "epoch": 0.1782178217821782,
1787
+ "grad_norm": 0.7890625,
1788
+ "learning_rate": 0.0008606948663703739,
1789
+ "loss": 10.2641,
1790
+ "step": 252
1791
+ },
1792
+ {
1793
+ "epoch": 0.1789250353606789,
1794
+ "grad_norm": 0.80078125,
1795
+ "learning_rate": 0.0008602634908927277,
1796
+ "loss": 10.2601,
1797
+ "step": 253
1798
+ },
1799
+ {
1800
+ "epoch": 0.17963224893917965,
1801
+ "grad_norm": 0.80859375,
1802
+ "learning_rate": 0.0008598298704458502,
1803
+ "loss": 10.2662,
1804
+ "step": 254
1805
+ },
1806
+ {
1807
+ "epoch": 0.18033946251768035,
1808
+ "grad_norm": 0.80859375,
1809
+ "learning_rate": 0.0008593940074025203,
1810
+ "loss": 10.2693,
1811
+ "step": 255
1812
+ },
1813
+ {
1814
+ "epoch": 0.18104667609618105,
1815
+ "grad_norm": 0.78515625,
1816
+ "learning_rate": 0.0008589559041477887,
1817
+ "loss": 10.2418,
1818
+ "step": 256
1819
+ },
1820
+ {
1821
+ "epoch": 0.18175388967468176,
1822
+ "grad_norm": 0.82421875,
1823
+ "learning_rate": 0.000858515563078964,
1824
+ "loss": 10.2431,
1825
+ "step": 257
1826
+ },
1827
+ {
1828
+ "epoch": 0.18246110325318246,
1829
+ "grad_norm": 0.8046875,
1830
+ "learning_rate": 0.0008580729866056009,
1831
+ "loss": 10.2722,
1832
+ "step": 258
1833
+ },
1834
+ {
1835
+ "epoch": 0.18316831683168316,
1836
+ "grad_norm": 0.83203125,
1837
+ "learning_rate": 0.0008576281771494854,
1838
+ "loss": 10.2565,
1839
+ "step": 259
1840
+ },
1841
+ {
1842
+ "epoch": 0.18387553041018387,
1843
+ "grad_norm": 0.84765625,
1844
+ "learning_rate": 0.0008571811371446231,
1845
+ "loss": 10.2524,
1846
+ "step": 260
1847
+ },
1848
+ {
1849
+ "epoch": 0.18458274398868457,
1850
+ "grad_norm": 0.80078125,
1851
+ "learning_rate": 0.0008567318690372251,
1852
+ "loss": 10.2671,
1853
+ "step": 261
1854
+ },
1855
+ {
1856
+ "epoch": 0.18528995756718528,
1857
+ "grad_norm": 0.8984375,
1858
+ "learning_rate": 0.0008562803752856944,
1859
+ "loss": 10.267,
1860
+ "step": 262
1861
+ },
1862
+ {
1863
+ "epoch": 0.185997171145686,
1864
+ "grad_norm": 0.81640625,
1865
+ "learning_rate": 0.000855826658360613,
1866
+ "loss": 10.2668,
1867
+ "step": 263
1868
+ },
1869
+ {
1870
+ "epoch": 0.1867043847241867,
1871
+ "grad_norm": 0.80078125,
1872
+ "learning_rate": 0.000855370720744728,
1873
+ "loss": 10.2517,
1874
+ "step": 264
1875
+ },
1876
+ {
1877
+ "epoch": 0.18741159830268742,
1878
+ "grad_norm": 0.85546875,
1879
+ "learning_rate": 0.0008549125649329386,
1880
+ "loss": 10.256,
1881
+ "step": 265
1882
+ },
1883
+ {
1884
+ "epoch": 0.18811881188118812,
1885
+ "grad_norm": 0.83984375,
1886
+ "learning_rate": 0.0008544521934322814,
1887
+ "loss": 10.2607,
1888
+ "step": 266
1889
+ },
1890
+ {
1891
+ "epoch": 0.18882602545968882,
1892
+ "grad_norm": 0.8203125,
1893
+ "learning_rate": 0.0008539896087619176,
1894
+ "loss": 10.2778,
1895
+ "step": 267
1896
+ },
1897
+ {
1898
+ "epoch": 0.18953323903818953,
1899
+ "grad_norm": 0.859375,
1900
+ "learning_rate": 0.0008535248134531189,
1901
+ "loss": 10.2601,
1902
+ "step": 268
1903
+ },
1904
+ {
1905
+ "epoch": 0.19024045261669023,
1906
+ "grad_norm": 0.84765625,
1907
+ "learning_rate": 0.0008530578100492538,
1908
+ "loss": 10.2399,
1909
+ "step": 269
1910
+ },
1911
+ {
1912
+ "epoch": 0.19094766619519093,
1913
+ "grad_norm": 0.828125,
1914
+ "learning_rate": 0.000852588601105773,
1915
+ "loss": 10.2647,
1916
+ "step": 270
1917
+ },
1918
+ {
1919
+ "epoch": 0.19165487977369167,
1920
+ "grad_norm": 0.8203125,
1921
+ "learning_rate": 0.0008521171891901965,
1922
+ "loss": 10.265,
1923
+ "step": 271
1924
+ },
1925
+ {
1926
+ "epoch": 0.19236209335219237,
1927
+ "grad_norm": 0.796875,
1928
+ "learning_rate": 0.000851643576882099,
1929
+ "loss": 10.2793,
1930
+ "step": 272
1931
+ },
1932
+ {
1933
+ "epoch": 0.19306930693069307,
1934
+ "grad_norm": 0.859375,
1935
+ "learning_rate": 0.0008511677667730952,
1936
+ "loss": 10.283,
1937
+ "step": 273
1938
+ },
1939
+ {
1940
+ "epoch": 0.19377652050919378,
1941
+ "grad_norm": 0.85546875,
1942
+ "learning_rate": 0.000850689761466827,
1943
+ "loss": 10.2648,
1944
+ "step": 274
1945
+ },
1946
+ {
1947
+ "epoch": 0.19448373408769448,
1948
+ "grad_norm": 0.8671875,
1949
+ "learning_rate": 0.0008502095635789478,
1950
+ "loss": 10.2946,
1951
+ "step": 275
1952
+ },
1953
+ {
1954
+ "epoch": 0.19519094766619519,
1955
+ "grad_norm": 0.86328125,
1956
+ "learning_rate": 0.0008497271757371093,
1957
+ "loss": 10.2982,
1958
+ "step": 276
1959
+ },
1960
+ {
1961
+ "epoch": 0.1958981612446959,
1962
+ "grad_norm": 0.8203125,
1963
+ "learning_rate": 0.0008492426005809464,
1964
+ "loss": 10.2918,
1965
+ "step": 277
1966
+ },
1967
+ {
1968
+ "epoch": 0.1966053748231966,
1969
+ "grad_norm": 0.84375,
1970
+ "learning_rate": 0.0008487558407620629,
1971
+ "loss": 10.2779,
1972
+ "step": 278
1973
+ },
1974
+ {
1975
+ "epoch": 0.19731258840169733,
1976
+ "grad_norm": 0.859375,
1977
+ "learning_rate": 0.0008482668989440178,
1978
+ "loss": 10.294,
1979
+ "step": 279
1980
+ },
1981
+ {
1982
+ "epoch": 0.19801980198019803,
1983
+ "grad_norm": 0.859375,
1984
+ "learning_rate": 0.0008477757778023092,
1985
+ "loss": 10.2892,
1986
+ "step": 280
1987
+ },
1988
+ {
1989
+ "epoch": 0.19872701555869873,
1990
+ "grad_norm": 0.828125,
1991
+ "learning_rate": 0.0008472824800243608,
1992
+ "loss": 10.3013,
1993
+ "step": 281
1994
+ },
1995
+ {
1996
+ "epoch": 0.19943422913719944,
1997
+ "grad_norm": 0.8515625,
1998
+ "learning_rate": 0.0008467870083095073,
1999
+ "loss": 10.3026,
2000
+ "step": 282
2001
+ },
2002
+ {
2003
+ "epoch": 0.20014144271570014,
2004
+ "grad_norm": 0.84375,
2005
+ "learning_rate": 0.0008462893653689785,
2006
+ "loss": 10.3071,
2007
+ "step": 283
2008
+ },
2009
+ {
2010
+ "epoch": 0.20084865629420084,
2011
+ "grad_norm": 0.87890625,
2012
+ "learning_rate": 0.0008457895539258857,
2013
+ "loss": 10.3179,
2014
+ "step": 284
2015
+ },
2016
+ {
2017
+ "epoch": 0.20155586987270155,
2018
+ "grad_norm": 0.88671875,
2019
+ "learning_rate": 0.0008452875767152062,
2020
+ "loss": 10.2782,
2021
+ "step": 285
2022
+ },
2023
+ {
2024
+ "epoch": 0.20226308345120225,
2025
+ "grad_norm": 0.87109375,
2026
+ "learning_rate": 0.0008447834364837685,
2027
+ "loss": 10.3004,
2028
+ "step": 286
2029
+ },
2030
+ {
2031
+ "epoch": 0.20297029702970298,
2032
+ "grad_norm": 0.8515625,
2033
+ "learning_rate": 0.0008442771359902366,
2034
+ "loss": 10.317,
2035
+ "step": 287
2036
+ },
2037
+ {
2038
+ "epoch": 0.2036775106082037,
2039
+ "grad_norm": 0.86328125,
2040
+ "learning_rate": 0.0008437686780050964,
2041
+ "loss": 10.3256,
2042
+ "step": 288
2043
+ },
2044
+ {
2045
+ "epoch": 0.2043847241867044,
2046
+ "grad_norm": 0.859375,
2047
+ "learning_rate": 0.0008432580653106389,
2048
+ "loss": 10.3102,
2049
+ "step": 289
2050
+ },
2051
+ {
2052
+ "epoch": 0.2050919377652051,
2053
+ "grad_norm": 0.84375,
2054
+ "learning_rate": 0.000842745300700946,
2055
+ "loss": 10.3125,
2056
+ "step": 290
2057
+ },
2058
+ {
2059
+ "epoch": 0.2057991513437058,
2060
+ "grad_norm": 0.859375,
2061
+ "learning_rate": 0.0008422303869818752,
2062
+ "loss": 10.325,
2063
+ "step": 291
2064
+ },
2065
+ {
2066
+ "epoch": 0.2065063649222065,
2067
+ "grad_norm": 0.8671875,
2068
+ "learning_rate": 0.0008417133269710432,
2069
+ "loss": 10.3293,
2070
+ "step": 292
2071
+ },
2072
+ {
2073
+ "epoch": 0.2072135785007072,
2074
+ "grad_norm": 0.8828125,
2075
+ "learning_rate": 0.0008411941234978122,
2076
+ "loss": 10.3085,
2077
+ "step": 293
2078
+ },
2079
+ {
2080
+ "epoch": 0.2079207920792079,
2081
+ "grad_norm": 0.86328125,
2082
+ "learning_rate": 0.0008406727794032725,
2083
+ "loss": 10.3144,
2084
+ "step": 294
2085
+ },
2086
+ {
2087
+ "epoch": 0.20862800565770862,
2088
+ "grad_norm": 0.90234375,
2089
+ "learning_rate": 0.0008401492975402288,
2090
+ "loss": 10.3095,
2091
+ "step": 295
2092
+ },
2093
+ {
2094
+ "epoch": 0.20933521923620935,
2095
+ "grad_norm": 0.85546875,
2096
+ "learning_rate": 0.0008396236807731831,
2097
+ "loss": 10.3412,
2098
+ "step": 296
2099
+ },
2100
+ {
2101
+ "epoch": 0.21004243281471005,
2102
+ "grad_norm": 0.859375,
2103
+ "learning_rate": 0.00083909593197832,
2104
+ "loss": 10.3543,
2105
+ "step": 297
2106
+ },
2107
+ {
2108
+ "epoch": 0.21074964639321075,
2109
+ "grad_norm": 0.86328125,
2110
+ "learning_rate": 0.0008385660540434904,
2111
+ "loss": 10.3612,
2112
+ "step": 298
2113
+ },
2114
+ {
2115
+ "epoch": 0.21145685997171146,
2116
+ "grad_norm": 0.890625,
2117
+ "learning_rate": 0.0008380340498681957,
2118
+ "loss": 10.3614,
2119
+ "step": 299
2120
+ },
2121
+ {
2122
+ "epoch": 0.21216407355021216,
2123
+ "grad_norm": 0.94140625,
2124
+ "learning_rate": 0.0008374999223635726,
2125
+ "loss": 10.3179,
2126
+ "step": 300
2127
+ },
2128
+ {
2129
+ "epoch": 0.21216407355021216,
2130
+ "eval_loss": 10.361818313598633,
2131
+ "eval_runtime": 1.2522,
2132
+ "eval_samples_per_second": 145.349,
2133
+ "eval_steps_per_second": 18.368,
2134
+ "step": 300
2135
  }
2136
  ],
2137
  "logging_steps": 1,
 
2151
  "attributes": {}
2152
  }
2153
  },
2154
+ "total_flos": 2.2277190057984e+16,
2155
  "train_batch_size": 128,
2156
  "trial_name": null,
2157
  "trial_params": null