irodkin committed on
Commit
6eaa560
·
verified ·
1 Parent(s): 6e8c494

Training checkpoint at step 5000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 4000,
3
- "best_metric": 2.430954933166504,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-4000",
5
- "epoch": 0.08,
6
  "eval_steps": 100,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1448,6 +1448,366 @@
1448
  "eval_samples_per_second": 3.532,
1449
  "eval_steps_per_second": 1.781,
1450
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1451
  }
1452
  ],
1453
  "logging_steps": 25,
@@ -1467,7 +1827,7 @@
1467
  "attributes": {}
1468
  }
1469
  },
1470
- "total_flos": 1.114120770054783e+19,
1471
  "train_batch_size": 1,
1472
  "trial_name": null,
1473
  "trial_params": null
 
1
  {
2
+ "best_global_step": 5000,
3
+ "best_metric": 2.426590919494629,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-5000",
5
+ "epoch": 0.1,
6
  "eval_steps": 100,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1448
  "eval_samples_per_second": 3.532,
1449
  "eval_steps_per_second": 1.781,
1450
  "step": 4000
1451
+ },
1452
+ {
1453
+ "epoch": 0.0805,
1454
+ "grad_norm": 0.021865944897834468,
1455
+ "learning_rate": 8.048e-06,
1456
+ "loss": 2.4283,
1457
+ "step": 4025
1458
+ },
1459
+ {
1460
+ "epoch": 0.081,
1461
+ "grad_norm": 0.020393010409248808,
1462
+ "learning_rate": 8.098000000000001e-06,
1463
+ "loss": 2.4142,
1464
+ "step": 4050
1465
+ },
1466
+ {
1467
+ "epoch": 0.0815,
1468
+ "grad_norm": 0.02279155824698799,
1469
+ "learning_rate": 8.148e-06,
1470
+ "loss": 2.4208,
1471
+ "step": 4075
1472
+ },
1473
+ {
1474
+ "epoch": 0.082,
1475
+ "grad_norm": 0.021110562493101104,
1476
+ "learning_rate": 8.198e-06,
1477
+ "loss": 2.4093,
1478
+ "step": 4100
1479
+ },
1480
+ {
1481
+ "epoch": 0.082,
1482
+ "eval_loss": 2.4299628734588623,
1483
+ "eval_runtime": 33.2215,
1484
+ "eval_samples_per_second": 3.522,
1485
+ "eval_steps_per_second": 1.776,
1486
+ "step": 4100
1487
+ },
1488
+ {
1489
+ "epoch": 0.0825,
1490
+ "grad_norm": 0.019752507861163327,
1491
+ "learning_rate": 8.248e-06,
1492
+ "loss": 2.4073,
1493
+ "step": 4125
1494
+ },
1495
+ {
1496
+ "epoch": 0.083,
1497
+ "grad_norm": 0.019897433088879975,
1498
+ "learning_rate": 8.298000000000001e-06,
1499
+ "loss": 2.4129,
1500
+ "step": 4150
1501
+ },
1502
+ {
1503
+ "epoch": 0.0835,
1504
+ "grad_norm": 0.02275241957806373,
1505
+ "learning_rate": 8.348e-06,
1506
+ "loss": 2.4243,
1507
+ "step": 4175
1508
+ },
1509
+ {
1510
+ "epoch": 0.084,
1511
+ "grad_norm": 0.02009113389579191,
1512
+ "learning_rate": 8.398e-06,
1513
+ "loss": 2.4138,
1514
+ "step": 4200
1515
+ },
1516
+ {
1517
+ "epoch": 0.084,
1518
+ "eval_loss": 2.4301230907440186,
1519
+ "eval_runtime": 33.0641,
1520
+ "eval_samples_per_second": 3.539,
1521
+ "eval_steps_per_second": 1.784,
1522
+ "step": 4200
1523
+ },
1524
+ {
1525
+ "epoch": 0.0845,
1526
+ "grad_norm": 0.021259070586902896,
1527
+ "learning_rate": 8.448000000000001e-06,
1528
+ "loss": 2.4212,
1529
+ "step": 4225
1530
+ },
1531
+ {
1532
+ "epoch": 0.085,
1533
+ "grad_norm": 0.021461643865178466,
1534
+ "learning_rate": 8.498e-06,
1535
+ "loss": 2.4242,
1536
+ "step": 4250
1537
+ },
1538
+ {
1539
+ "epoch": 0.0855,
1540
+ "grad_norm": 0.02129278617109427,
1541
+ "learning_rate": 8.548e-06,
1542
+ "loss": 2.4153,
1543
+ "step": 4275
1544
+ },
1545
+ {
1546
+ "epoch": 0.086,
1547
+ "grad_norm": 0.019884381961586706,
1548
+ "learning_rate": 8.598000000000001e-06,
1549
+ "loss": 2.4107,
1550
+ "step": 4300
1551
+ },
1552
+ {
1553
+ "epoch": 0.086,
1554
+ "eval_loss": 2.429638385772705,
1555
+ "eval_runtime": 33.1452,
1556
+ "eval_samples_per_second": 3.53,
1557
+ "eval_steps_per_second": 1.78,
1558
+ "step": 4300
1559
+ },
1560
+ {
1561
+ "epoch": 0.0865,
1562
+ "grad_norm": 0.02127578557225418,
1563
+ "learning_rate": 8.648000000000001e-06,
1564
+ "loss": 2.4202,
1565
+ "step": 4325
1566
+ },
1567
+ {
1568
+ "epoch": 0.087,
1569
+ "grad_norm": 0.021749788475476855,
1570
+ "learning_rate": 8.698e-06,
1571
+ "loss": 2.4274,
1572
+ "step": 4350
1573
+ },
1574
+ {
1575
+ "epoch": 0.0875,
1576
+ "grad_norm": 0.021521494708913836,
1577
+ "learning_rate": 8.748000000000002e-06,
1578
+ "loss": 2.4189,
1579
+ "step": 4375
1580
+ },
1581
+ {
1582
+ "epoch": 0.088,
1583
+ "grad_norm": 0.021276426458537334,
1584
+ "learning_rate": 8.798000000000001e-06,
1585
+ "loss": 2.4152,
1586
+ "step": 4400
1587
+ },
1588
+ {
1589
+ "epoch": 0.088,
1590
+ "eval_loss": 2.4292917251586914,
1591
+ "eval_runtime": 33.1057,
1592
+ "eval_samples_per_second": 3.534,
1593
+ "eval_steps_per_second": 1.782,
1594
+ "step": 4400
1595
+ },
1596
+ {
1597
+ "epoch": 0.0885,
1598
+ "grad_norm": 0.019843371943772815,
1599
+ "learning_rate": 8.848e-06,
1600
+ "loss": 2.421,
1601
+ "step": 4425
1602
+ },
1603
+ {
1604
+ "epoch": 0.089,
1605
+ "grad_norm": 0.02031045171970109,
1606
+ "learning_rate": 8.898000000000002e-06,
1607
+ "loss": 2.4201,
1608
+ "step": 4450
1609
+ },
1610
+ {
1611
+ "epoch": 0.0895,
1612
+ "grad_norm": 0.018642717079241176,
1613
+ "learning_rate": 8.948000000000001e-06,
1614
+ "loss": 2.4171,
1615
+ "step": 4475
1616
+ },
1617
+ {
1618
+ "epoch": 0.09,
1619
+ "grad_norm": 0.021016901396559935,
1620
+ "learning_rate": 8.998000000000001e-06,
1621
+ "loss": 2.4257,
1622
+ "step": 4500
1623
+ },
1624
+ {
1625
+ "epoch": 0.09,
1626
+ "eval_loss": 2.4288113117218018,
1627
+ "eval_runtime": 33.1217,
1628
+ "eval_samples_per_second": 3.532,
1629
+ "eval_steps_per_second": 1.781,
1630
+ "step": 4500
1631
+ },
1632
+ {
1633
+ "epoch": 0.0905,
1634
+ "grad_norm": 0.021595090834222327,
1635
+ "learning_rate": 9.048e-06,
1636
+ "loss": 2.4209,
1637
+ "step": 4525
1638
+ },
1639
+ {
1640
+ "epoch": 0.091,
1641
+ "grad_norm": 0.020500341653961213,
1642
+ "learning_rate": 9.098000000000002e-06,
1643
+ "loss": 2.4093,
1644
+ "step": 4550
1645
+ },
1646
+ {
1647
+ "epoch": 0.0915,
1648
+ "grad_norm": 0.021134665935359346,
1649
+ "learning_rate": 9.148e-06,
1650
+ "loss": 2.4238,
1651
+ "step": 4575
1652
+ },
1653
+ {
1654
+ "epoch": 0.092,
1655
+ "grad_norm": 0.018064298488706988,
1656
+ "learning_rate": 9.198e-06,
1657
+ "loss": 2.4163,
1658
+ "step": 4600
1659
+ },
1660
+ {
1661
+ "epoch": 0.092,
1662
+ "eval_loss": 2.428257465362549,
1663
+ "eval_runtime": 33.451,
1664
+ "eval_samples_per_second": 3.498,
1665
+ "eval_steps_per_second": 1.764,
1666
+ "step": 4600
1667
+ },
1668
+ {
1669
+ "epoch": 0.0925,
1670
+ "grad_norm": 0.019704962175624032,
1671
+ "learning_rate": 9.248e-06,
1672
+ "loss": 2.4082,
1673
+ "step": 4625
1674
+ },
1675
+ {
1676
+ "epoch": 0.093,
1677
+ "grad_norm": 0.019712333508134283,
1678
+ "learning_rate": 9.298e-06,
1679
+ "loss": 2.4089,
1680
+ "step": 4650
1681
+ },
1682
+ {
1683
+ "epoch": 0.0935,
1684
+ "grad_norm": 0.021269463834833153,
1685
+ "learning_rate": 9.348000000000001e-06,
1686
+ "loss": 2.408,
1687
+ "step": 4675
1688
+ },
1689
+ {
1690
+ "epoch": 0.094,
1691
+ "grad_norm": 0.021278662940784676,
1692
+ "learning_rate": 9.398e-06,
1693
+ "loss": 2.4189,
1694
+ "step": 4700
1695
+ },
1696
+ {
1697
+ "epoch": 0.094,
1698
+ "eval_loss": 2.4279165267944336,
1699
+ "eval_runtime": 33.1606,
1700
+ "eval_samples_per_second": 3.528,
1701
+ "eval_steps_per_second": 1.779,
1702
+ "step": 4700
1703
+ },
1704
+ {
1705
+ "epoch": 0.0945,
1706
+ "grad_norm": 0.018504564797986272,
1707
+ "learning_rate": 9.448e-06,
1708
+ "loss": 2.4254,
1709
+ "step": 4725
1710
+ },
1711
+ {
1712
+ "epoch": 0.095,
1713
+ "grad_norm": 0.01917099113509997,
1714
+ "learning_rate": 9.498000000000001e-06,
1715
+ "loss": 2.411,
1716
+ "step": 4750
1717
+ },
1718
+ {
1719
+ "epoch": 0.0955,
1720
+ "grad_norm": 0.019097394482211122,
1721
+ "learning_rate": 9.548e-06,
1722
+ "loss": 2.4209,
1723
+ "step": 4775
1724
+ },
1725
+ {
1726
+ "epoch": 0.096,
1727
+ "grad_norm": 0.020220692469392707,
1728
+ "learning_rate": 9.598e-06,
1729
+ "loss": 2.4066,
1730
+ "step": 4800
1731
+ },
1732
+ {
1733
+ "epoch": 0.096,
1734
+ "eval_loss": 2.4273650646209717,
1735
+ "eval_runtime": 33.1079,
1736
+ "eval_samples_per_second": 3.534,
1737
+ "eval_steps_per_second": 1.782,
1738
+ "step": 4800
1739
+ },
1740
+ {
1741
+ "epoch": 0.0965,
1742
+ "grad_norm": 0.019607148490934756,
1743
+ "learning_rate": 9.648000000000001e-06,
1744
+ "loss": 2.4132,
1745
+ "step": 4825
1746
+ },
1747
+ {
1748
+ "epoch": 0.097,
1749
+ "grad_norm": 0.019388710503851023,
1750
+ "learning_rate": 9.698000000000001e-06,
1751
+ "loss": 2.4096,
1752
+ "step": 4850
1753
+ },
1754
+ {
1755
+ "epoch": 0.0975,
1756
+ "grad_norm": 0.019593746411763164,
1757
+ "learning_rate": 9.748e-06,
1758
+ "loss": 2.4064,
1759
+ "step": 4875
1760
+ },
1761
+ {
1762
+ "epoch": 0.098,
1763
+ "grad_norm": 0.018761734791343965,
1764
+ "learning_rate": 9.798e-06,
1765
+ "loss": 2.4033,
1766
+ "step": 4900
1767
+ },
1768
+ {
1769
+ "epoch": 0.098,
1770
+ "eval_loss": 2.4270286560058594,
1771
+ "eval_runtime": 33.0269,
1772
+ "eval_samples_per_second": 3.543,
1773
+ "eval_steps_per_second": 1.786,
1774
+ "step": 4900
1775
+ },
1776
+ {
1777
+ "epoch": 0.0985,
1778
+ "grad_norm": 0.018964507342139367,
1779
+ "learning_rate": 9.848000000000001e-06,
1780
+ "loss": 2.4211,
1781
+ "step": 4925
1782
+ },
1783
+ {
1784
+ "epoch": 0.099,
1785
+ "grad_norm": 0.01858861943184826,
1786
+ "learning_rate": 9.898e-06,
1787
+ "loss": 2.4032,
1788
+ "step": 4950
1789
+ },
1790
+ {
1791
+ "epoch": 0.0995,
1792
+ "grad_norm": 0.01821023564956819,
1793
+ "learning_rate": 9.948e-06,
1794
+ "loss": 2.4031,
1795
+ "step": 4975
1796
+ },
1797
+ {
1798
+ "epoch": 0.1,
1799
+ "grad_norm": 0.018839474555921314,
1800
+ "learning_rate": 9.998000000000002e-06,
1801
+ "loss": 2.4112,
1802
+ "step": 5000
1803
+ },
1804
+ {
1805
+ "epoch": 0.1,
1806
+ "eval_loss": 2.426590919494629,
1807
+ "eval_runtime": 33.0133,
1808
+ "eval_samples_per_second": 3.544,
1809
+ "eval_steps_per_second": 1.787,
1810
+ "step": 5000
1811
  }
1812
  ],
1813
  "logging_steps": 25,
 
1827
  "attributes": {}
1828
  }
1829
  },
1830
+ "total_flos": 1.3926509625684787e+19,
1831
  "train_batch_size": 1,
1832
  "trial_name": null,
1833
  "trial_params": null