Wilsonwin commited on
Commit
fadff9c
·
verified ·
1 Parent(s): c997a34

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b30006c3c8ebdd220eda160d67d570192e678e4b938a46729d63d00fc226c89
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35a25838e882b8b8d3abc2c30fca06831b937a101be8b7eca174b157589ba0d0
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d83b910297466c079691649d9d51db171a5eff2b984ed10840ddd4d5cf17b1d
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08742255bcc023f4a34f8f2a127617bb854ab5fb96a4602c10f2895bfc656f64
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8647979d889bb2b15d0a3e8961a7e547be28d07767d240f858bd959476bb870c
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a04575953c998a8fd3197b1b8249c8e72c33f4bb7c27b036788a4d9e537cf3cd
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a6e444c46ec49de792e4afbe9af4aa4613bca60425da2b0ac2cae225e516fcc
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c5f5054d1fb89b5c064db193ff9ee8b30b57ffe17a11e00d28cfa91ea00081e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.33789491468153404,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1447,6 +1447,364 @@
1447
  "eval_samples_per_second": 255.761,
1448
  "eval_steps_per_second": 5.371,
1449
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1450
  }
1451
  ],
1452
  "logging_steps": 10,
@@ -1466,7 +1824,7 @@
1466
  "attributes": {}
1467
  }
1468
  },
1469
- "total_flos": 6.6891364171776e+16,
1470
  "train_batch_size": 48,
1471
  "trial_name": null,
1472
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.42236864335191754,
6
  "eval_steps": 500,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1447
  "eval_samples_per_second": 255.761,
1448
  "eval_steps_per_second": 5.371,
1449
  "step": 2000
1450
+ },
1451
+ {
1452
+ "epoch": 0.3395843892549417,
1453
+ "grad_norm": 1.0625675916671753,
1454
+ "learning_rate": 0.0002999993805131495,
1455
+ "loss": 5.482983016967774,
1456
+ "step": 2010
1457
+ },
1458
+ {
1459
+ "epoch": 0.3412738638283494,
1460
+ "grad_norm": 0.9310702681541443,
1461
+ "learning_rate": 0.00029999723908369233,
1462
+ "loss": 5.477756500244141,
1463
+ "step": 2020
1464
+ },
1465
+ {
1466
+ "epoch": 0.3429633384017571,
1467
+ "grad_norm": 0.8275931477546692,
1468
+ "learning_rate": 0.0002999935680854744,
1469
+ "loss": 5.4467018127441404,
1470
+ "step": 2030
1471
+ },
1472
+ {
1473
+ "epoch": 0.3446528129751647,
1474
+ "grad_norm": 0.8972215056419373,
1475
+ "learning_rate": 0.00029998836755593,
1476
+ "loss": 5.415990829467773,
1477
+ "step": 2040
1478
+ },
1479
+ {
1480
+ "epoch": 0.3463422875485724,
1481
+ "grad_norm": 1.0727229118347168,
1482
+ "learning_rate": 0.00029998163754809044,
1483
+ "loss": 5.403407287597656,
1484
+ "step": 2050
1485
+ },
1486
+ {
1487
+ "epoch": 0.34803176212198006,
1488
+ "grad_norm": 1.0068520307540894,
1489
+ "learning_rate": 0.0002999733781305839,
1490
+ "loss": 5.4188987731933596,
1491
+ "step": 2060
1492
+ },
1493
+ {
1494
+ "epoch": 0.34972123669538774,
1495
+ "grad_norm": 0.9327341914176941,
1496
+ "learning_rate": 0.00029996358938763406,
1497
+ "loss": 5.406315612792969,
1498
+ "step": 2070
1499
+ },
1500
+ {
1501
+ "epoch": 0.3514107112687954,
1502
+ "grad_norm": 1.022828221321106,
1503
+ "learning_rate": 0.0002999522714190599,
1504
+ "loss": 5.410961532592774,
1505
+ "step": 2080
1506
+ },
1507
+ {
1508
+ "epoch": 0.3531001858422031,
1509
+ "grad_norm": 0.8379955887794495,
1510
+ "learning_rate": 0.0002999394243402743,
1511
+ "loss": 5.411350250244141,
1512
+ "step": 2090
1513
+ },
1514
+ {
1515
+ "epoch": 0.3547896604156107,
1516
+ "grad_norm": 0.8905497193336487,
1517
+ "learning_rate": 0.00029992504828228283,
1518
+ "loss": 5.384899520874024,
1519
+ "step": 2100
1520
+ },
1521
+ {
1522
+ "epoch": 0.3564791349890184,
1523
+ "grad_norm": 0.7869957685470581,
1524
+ "learning_rate": 0.00029990914339168286,
1525
+ "loss": 5.391331481933594,
1526
+ "step": 2110
1527
+ },
1528
+ {
1529
+ "epoch": 0.3581686095624261,
1530
+ "grad_norm": 0.7781967520713806,
1531
+ "learning_rate": 0.00029989170983066126,
1532
+ "loss": 5.365080261230469,
1533
+ "step": 2120
1534
+ },
1535
+ {
1536
+ "epoch": 0.35985808413583376,
1537
+ "grad_norm": 0.8611620664596558,
1538
+ "learning_rate": 0.0002998727477769937,
1539
+ "loss": 5.367116546630859,
1540
+ "step": 2130
1541
+ },
1542
+ {
1543
+ "epoch": 0.36154755870924143,
1544
+ "grad_norm": 0.8369846940040588,
1545
+ "learning_rate": 0.0002998522574240421,
1546
+ "loss": 5.361904525756836,
1547
+ "step": 2140
1548
+ },
1549
+ {
1550
+ "epoch": 0.3632370332826491,
1551
+ "grad_norm": 0.893395721912384,
1552
+ "learning_rate": 0.00029983023898075305,
1553
+ "loss": 5.338259887695313,
1554
+ "step": 2150
1555
+ },
1556
+ {
1557
+ "epoch": 0.3649265078560568,
1558
+ "grad_norm": 0.9806540012359619,
1559
+ "learning_rate": 0.00029980669267165545,
1560
+ "loss": 5.33393440246582,
1561
+ "step": 2160
1562
+ },
1563
+ {
1564
+ "epoch": 0.3666159824294644,
1565
+ "grad_norm": 0.789153516292572,
1566
+ "learning_rate": 0.0002997816187368584,
1567
+ "loss": 5.347314834594727,
1568
+ "step": 2170
1569
+ },
1570
+ {
1571
+ "epoch": 0.3683054570028721,
1572
+ "grad_norm": 0.731369137763977,
1573
+ "learning_rate": 0.00029975501743204866,
1574
+ "loss": 5.322664260864258,
1575
+ "step": 2180
1576
+ },
1577
+ {
1578
+ "epoch": 0.3699949315762798,
1579
+ "grad_norm": 0.6811886429786682,
1580
+ "learning_rate": 0.00029972688902848803,
1581
+ "loss": 5.326079177856445,
1582
+ "step": 2190
1583
+ },
1584
+ {
1585
+ "epoch": 0.37168440614968745,
1586
+ "grad_norm": 0.8143295645713806,
1587
+ "learning_rate": 0.0002996972338130106,
1588
+ "loss": 5.30379638671875,
1589
+ "step": 2200
1590
+ },
1591
+ {
1592
+ "epoch": 0.37337388072309513,
1593
+ "grad_norm": 0.8854978680610657,
1594
+ "learning_rate": 0.00029966605208801996,
1595
+ "loss": 5.301242828369141,
1596
+ "step": 2210
1597
+ },
1598
+ {
1599
+ "epoch": 0.3750633552965028,
1600
+ "grad_norm": 0.77631014585495,
1601
+ "learning_rate": 0.0002996333441714859,
1602
+ "loss": 5.294522476196289,
1603
+ "step": 2220
1604
+ },
1605
+ {
1606
+ "epoch": 0.37675282986991043,
1607
+ "grad_norm": 0.7743359208106995,
1608
+ "learning_rate": 0.00029959911039694127,
1609
+ "loss": 5.313030624389649,
1610
+ "step": 2230
1611
+ },
1612
+ {
1613
+ "epoch": 0.3784423044433181,
1614
+ "grad_norm": 0.8531479239463806,
1615
+ "learning_rate": 0.00029956335111347855,
1616
+ "loss": 5.275916671752929,
1617
+ "step": 2240
1618
+ },
1619
+ {
1620
+ "epoch": 0.3801317790167258,
1621
+ "grad_norm": 0.722363293170929,
1622
+ "learning_rate": 0.0002995260666857463,
1623
+ "loss": 5.2906639099121096,
1624
+ "step": 2250
1625
+ },
1626
+ {
1627
+ "epoch": 0.38182125359013347,
1628
+ "grad_norm": 0.7797225713729858,
1629
+ "learning_rate": 0.00029948725749394563,
1630
+ "loss": 5.2658641815185545,
1631
+ "step": 2260
1632
+ },
1633
+ {
1634
+ "epoch": 0.38351072816354115,
1635
+ "grad_norm": 0.8231165409088135,
1636
+ "learning_rate": 0.00029944692393382586,
1637
+ "loss": 5.2770263671875,
1638
+ "step": 2270
1639
+ },
1640
+ {
1641
+ "epoch": 0.3852002027369488,
1642
+ "grad_norm": 0.8083261847496033,
1643
+ "learning_rate": 0.000299405066416681,
1644
+ "loss": 5.277169799804687,
1645
+ "step": 2280
1646
+ },
1647
+ {
1648
+ "epoch": 0.3868896773103565,
1649
+ "grad_norm": 0.8675849437713623,
1650
+ "learning_rate": 0.0002993616853693452,
1651
+ "loss": 5.258210754394531,
1652
+ "step": 2290
1653
+ },
1654
+ {
1655
+ "epoch": 0.38857915188376413,
1656
+ "grad_norm": 0.7585932016372681,
1657
+ "learning_rate": 0.0002993167812341886,
1658
+ "loss": 5.252765655517578,
1659
+ "step": 2300
1660
+ },
1661
+ {
1662
+ "epoch": 0.3902686264571718,
1663
+ "grad_norm": 0.8213605284690857,
1664
+ "learning_rate": 0.0002992703544691127,
1665
+ "loss": 5.222419357299804,
1666
+ "step": 2310
1667
+ },
1668
+ {
1669
+ "epoch": 0.3919581010305795,
1670
+ "grad_norm": 0.7984234690666199,
1671
+ "learning_rate": 0.00029922240554754577,
1672
+ "loss": 5.227847671508789,
1673
+ "step": 2320
1674
+ },
1675
+ {
1676
+ "epoch": 0.39364757560398717,
1677
+ "grad_norm": 0.8216149806976318,
1678
+ "learning_rate": 0.00029917293495843793,
1679
+ "loss": 5.215268325805664,
1680
+ "step": 2330
1681
+ },
1682
+ {
1683
+ "epoch": 0.39533705017739484,
1684
+ "grad_norm": 0.7992113828659058,
1685
+ "learning_rate": 0.0002991219432062562,
1686
+ "loss": 5.251160049438477,
1687
+ "step": 2340
1688
+ },
1689
+ {
1690
+ "epoch": 0.3970265247508025,
1691
+ "grad_norm": 0.7669650316238403,
1692
+ "learning_rate": 0.0002990694308109795,
1693
+ "loss": 5.255714797973633,
1694
+ "step": 2350
1695
+ },
1696
+ {
1697
+ "epoch": 0.39871599932421015,
1698
+ "grad_norm": 0.7685340046882629,
1699
+ "learning_rate": 0.0002990153983080932,
1700
+ "loss": 5.2186332702636715,
1701
+ "step": 2360
1702
+ },
1703
+ {
1704
+ "epoch": 0.4004054738976178,
1705
+ "grad_norm": 0.8289806246757507,
1706
+ "learning_rate": 0.0002989598462485835,
1707
+ "loss": 5.2316020965576175,
1708
+ "step": 2370
1709
+ },
1710
+ {
1711
+ "epoch": 0.4020949484710255,
1712
+ "grad_norm": 0.7260857224464417,
1713
+ "learning_rate": 0.00029890277519893215,
1714
+ "loss": 5.210884857177734,
1715
+ "step": 2380
1716
+ },
1717
+ {
1718
+ "epoch": 0.4037844230444332,
1719
+ "grad_norm": 0.6450658440589905,
1720
+ "learning_rate": 0.0002988441857411106,
1721
+ "loss": 5.194115066528321,
1722
+ "step": 2390
1723
+ },
1724
+ {
1725
+ "epoch": 0.40547389761784086,
1726
+ "grad_norm": 0.723818838596344,
1727
+ "learning_rate": 0.0002987840784725737,
1728
+ "loss": 5.197711563110351,
1729
+ "step": 2400
1730
+ },
1731
+ {
1732
+ "epoch": 0.40716337219124854,
1733
+ "grad_norm": 0.8113153576850891,
1734
+ "learning_rate": 0.0002987224540062542,
1735
+ "loss": 5.196290588378906,
1736
+ "step": 2410
1737
+ },
1738
+ {
1739
+ "epoch": 0.4088528467646562,
1740
+ "grad_norm": 0.8224965929985046,
1741
+ "learning_rate": 0.00029865931297055605,
1742
+ "loss": 5.174480819702149,
1743
+ "step": 2420
1744
+ },
1745
+ {
1746
+ "epoch": 0.41054232133806384,
1747
+ "grad_norm": 0.9786369204521179,
1748
+ "learning_rate": 0.00029859465600934814,
1749
+ "loss": 5.19611701965332,
1750
+ "step": 2430
1751
+ },
1752
+ {
1753
+ "epoch": 0.4122317959114715,
1754
+ "grad_norm": 0.8020685911178589,
1755
+ "learning_rate": 0.0002985284837819577,
1756
+ "loss": 5.181368637084961,
1757
+ "step": 2440
1758
+ },
1759
+ {
1760
+ "epoch": 0.4139212704848792,
1761
+ "grad_norm": 0.7282792329788208,
1762
+ "learning_rate": 0.0002984607969631636,
1763
+ "loss": 5.1728168487548825,
1764
+ "step": 2450
1765
+ },
1766
+ {
1767
+ "epoch": 0.4156107450582869,
1768
+ "grad_norm": 0.6869542598724365,
1769
+ "learning_rate": 0.00029839159624318954,
1770
+ "loss": 5.172641372680664,
1771
+ "step": 2460
1772
+ },
1773
+ {
1774
+ "epoch": 0.41730021963169456,
1775
+ "grad_norm": 0.8235262632369995,
1776
+ "learning_rate": 0.00029832088232769694,
1777
+ "loss": 5.165771484375,
1778
+ "step": 2470
1779
+ },
1780
+ {
1781
+ "epoch": 0.41898969420510224,
1782
+ "grad_norm": 0.7626176476478577,
1783
+ "learning_rate": 0.0002982486559377776,
1784
+ "loss": 5.175928115844727,
1785
+ "step": 2480
1786
+ },
1787
+ {
1788
+ "epoch": 0.42067916877850986,
1789
+ "grad_norm": 0.636053740978241,
1790
+ "learning_rate": 0.0002981749178099467,
1791
+ "loss": 5.135253143310547,
1792
+ "step": 2490
1793
+ },
1794
+ {
1795
+ "epoch": 0.42236864335191754,
1796
+ "grad_norm": 0.6814470291137695,
1797
+ "learning_rate": 0.000298099668696135,
1798
+ "loss": 5.177354049682617,
1799
+ "step": 2500
1800
+ },
1801
+ {
1802
+ "epoch": 0.42236864335191754,
1803
+ "eval_loss": 5.138686656951904,
1804
+ "eval_runtime": 3.9981,
1805
+ "eval_samples_per_second": 250.119,
1806
+ "eval_steps_per_second": 5.253,
1807
+ "step": 2500
1808
  }
1809
  ],
1810
  "logging_steps": 10,
 
1824
  "attributes": {}
1825
  }
1826
  },
1827
+ "total_flos": 8.361420521472e+16,
1828
  "train_batch_size": 48,
1829
  "trial_name": null,
1830
  "trial_params": null