Wilsonwin commited on
Commit
61e7635
·
verified ·
1 Parent(s): 8cf5ae0

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86977870d3df332c5c975d8b4f0d570e4557c1d4fd4364b77a5fac955fe62c58
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb64855cd768e7aa0f6f46d54fdfee34da708569cc56da521bb521f1101f672a
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2be2d6cc85c202403ae7a604b614b11bb028322263b6e955934cf8a2d4ef8092
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b80bf474cf78a89650cb9274c36cb3774d0d787508f786edf03b70cad49b53c
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a141ddada80b12146ad2875b480471ca4604a84a507446df6ce95668765adaf4
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e85fbbb21d9891a877eaefba7c40e5005f7303b4635375bbc6e0c808069fd11f
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a6e444c46ec49de792e4afbe9af4aa4613bca60425da2b0ac2cae225e516fcc
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c5f5054d1fb89b5c064db193ff9ee8b30b57ffe17a11e00d28cfa91ea00081e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.33789491468153404,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1447,6 +1447,364 @@
1447
  "eval_samples_per_second": 270.591,
1448
  "eval_steps_per_second": 5.682,
1449
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1450
  }
1451
  ],
1452
  "logging_steps": 10,
@@ -1466,7 +1824,7 @@
1466
  "attributes": {}
1467
  }
1468
  },
1469
- "total_flos": 6.6891364171776e+16,
1470
  "train_batch_size": 48,
1471
  "trial_name": null,
1472
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.42236864335191754,
6
  "eval_steps": 500,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1447
  "eval_samples_per_second": 270.591,
1448
  "eval_steps_per_second": 5.682,
1449
  "step": 2000
1450
+ },
1451
+ {
1452
+ "epoch": 0.3395843892549417,
1453
+ "grad_norm": 0.9857544302940369,
1454
+ "learning_rate": 0.0002999993805131495,
1455
+ "loss": 5.475189208984375,
1456
+ "step": 2010
1457
+ },
1458
+ {
1459
+ "epoch": 0.3412738638283494,
1460
+ "grad_norm": 0.8797745704650879,
1461
+ "learning_rate": 0.00029999723908369233,
1462
+ "loss": 5.471670150756836,
1463
+ "step": 2020
1464
+ },
1465
+ {
1466
+ "epoch": 0.3429633384017571,
1467
+ "grad_norm": 0.8447170257568359,
1468
+ "learning_rate": 0.0002999935680854744,
1469
+ "loss": 5.436410522460937,
1470
+ "step": 2030
1471
+ },
1472
+ {
1473
+ "epoch": 0.3446528129751647,
1474
+ "grad_norm": 0.87137770652771,
1475
+ "learning_rate": 0.00029998836755593,
1476
+ "loss": 5.40980339050293,
1477
+ "step": 2040
1478
+ },
1479
+ {
1480
+ "epoch": 0.3463422875485724,
1481
+ "grad_norm": 1.1437028646469116,
1482
+ "learning_rate": 0.00029998163754809044,
1483
+ "loss": 5.396385192871094,
1484
+ "step": 2050
1485
+ },
1486
+ {
1487
+ "epoch": 0.34803176212198006,
1488
+ "grad_norm": 0.7306678295135498,
1489
+ "learning_rate": 0.0002999733781305839,
1490
+ "loss": 5.408660888671875,
1491
+ "step": 2060
1492
+ },
1493
+ {
1494
+ "epoch": 0.34972123669538774,
1495
+ "grad_norm": 0.7920585870742798,
1496
+ "learning_rate": 0.00029996358938763406,
1497
+ "loss": 5.393876647949218,
1498
+ "step": 2070
1499
+ },
1500
+ {
1501
+ "epoch": 0.3514107112687954,
1502
+ "grad_norm": 0.8976436257362366,
1503
+ "learning_rate": 0.0002999522714190599,
1504
+ "loss": 5.399116897583008,
1505
+ "step": 2080
1506
+ },
1507
+ {
1508
+ "epoch": 0.3531001858422031,
1509
+ "grad_norm": 0.8801947832107544,
1510
+ "learning_rate": 0.0002999394243402743,
1511
+ "loss": 5.40356330871582,
1512
+ "step": 2090
1513
+ },
1514
+ {
1515
+ "epoch": 0.3547896604156107,
1516
+ "grad_norm": 0.9378096461296082,
1517
+ "learning_rate": 0.00029992504828228283,
1518
+ "loss": 5.37578010559082,
1519
+ "step": 2100
1520
+ },
1521
+ {
1522
+ "epoch": 0.3564791349890184,
1523
+ "grad_norm": 0.760123074054718,
1524
+ "learning_rate": 0.00029990914339168286,
1525
+ "loss": 5.383629989624024,
1526
+ "step": 2110
1527
+ },
1528
+ {
1529
+ "epoch": 0.3581686095624261,
1530
+ "grad_norm": 0.8094545006752014,
1531
+ "learning_rate": 0.00029989170983066126,
1532
+ "loss": 5.359802627563477,
1533
+ "step": 2120
1534
+ },
1535
+ {
1536
+ "epoch": 0.35985808413583376,
1537
+ "grad_norm": 0.9137438535690308,
1538
+ "learning_rate": 0.0002998727477769937,
1539
+ "loss": 5.361555862426758,
1540
+ "step": 2130
1541
+ },
1542
+ {
1543
+ "epoch": 0.36154755870924143,
1544
+ "grad_norm": 0.7989398241043091,
1545
+ "learning_rate": 0.0002998522574240421,
1546
+ "loss": 5.354197692871094,
1547
+ "step": 2140
1548
+ },
1549
+ {
1550
+ "epoch": 0.3632370332826491,
1551
+ "grad_norm": 0.8207266330718994,
1552
+ "learning_rate": 0.00029983023898075305,
1553
+ "loss": 5.331578063964844,
1554
+ "step": 2150
1555
+ },
1556
+ {
1557
+ "epoch": 0.3649265078560568,
1558
+ "grad_norm": 0.9194368124008179,
1559
+ "learning_rate": 0.00029980669267165545,
1560
+ "loss": 5.32526741027832,
1561
+ "step": 2160
1562
+ },
1563
+ {
1564
+ "epoch": 0.3666159824294644,
1565
+ "grad_norm": 0.80011385679245,
1566
+ "learning_rate": 0.0002997816187368584,
1567
+ "loss": 5.341778182983399,
1568
+ "step": 2170
1569
+ },
1570
+ {
1571
+ "epoch": 0.3683054570028721,
1572
+ "grad_norm": 0.7985261678695679,
1573
+ "learning_rate": 0.00029975501743204866,
1574
+ "loss": 5.31537094116211,
1575
+ "step": 2180
1576
+ },
1577
+ {
1578
+ "epoch": 0.3699949315762798,
1579
+ "grad_norm": 0.7046862244606018,
1580
+ "learning_rate": 0.00029972688902848803,
1581
+ "loss": 5.3185478210449215,
1582
+ "step": 2190
1583
+ },
1584
+ {
1585
+ "epoch": 0.37168440614968745,
1586
+ "grad_norm": 0.833369791507721,
1587
+ "learning_rate": 0.0002996972338130106,
1588
+ "loss": 5.297074890136718,
1589
+ "step": 2200
1590
+ },
1591
+ {
1592
+ "epoch": 0.37337388072309513,
1593
+ "grad_norm": 0.9138798117637634,
1594
+ "learning_rate": 0.00029966605208801996,
1595
+ "loss": 5.29405403137207,
1596
+ "step": 2210
1597
+ },
1598
+ {
1599
+ "epoch": 0.3750633552965028,
1600
+ "grad_norm": 0.8588988780975342,
1601
+ "learning_rate": 0.0002996333441714859,
1602
+ "loss": 5.285926437377929,
1603
+ "step": 2220
1604
+ },
1605
+ {
1606
+ "epoch": 0.37675282986991043,
1607
+ "grad_norm": 0.7140660285949707,
1608
+ "learning_rate": 0.00029959911039694127,
1609
+ "loss": 5.305549621582031,
1610
+ "step": 2230
1611
+ },
1612
+ {
1613
+ "epoch": 0.3784423044433181,
1614
+ "grad_norm": 0.7165802717208862,
1615
+ "learning_rate": 0.00029956335111347855,
1616
+ "loss": 5.268837356567383,
1617
+ "step": 2240
1618
+ },
1619
+ {
1620
+ "epoch": 0.3801317790167258,
1621
+ "grad_norm": 0.8172394037246704,
1622
+ "learning_rate": 0.0002995260666857463,
1623
+ "loss": 5.283020782470703,
1624
+ "step": 2250
1625
+ },
1626
+ {
1627
+ "epoch": 0.38182125359013347,
1628
+ "grad_norm": 0.7977796792984009,
1629
+ "learning_rate": 0.00029948725749394563,
1630
+ "loss": 5.262269973754883,
1631
+ "step": 2260
1632
+ },
1633
+ {
1634
+ "epoch": 0.38351072816354115,
1635
+ "grad_norm": 0.7707539200782776,
1636
+ "learning_rate": 0.00029944692393382586,
1637
+ "loss": 5.270823669433594,
1638
+ "step": 2270
1639
+ },
1640
+ {
1641
+ "epoch": 0.3852002027369488,
1642
+ "grad_norm": 0.76548171043396,
1643
+ "learning_rate": 0.000299405066416681,
1644
+ "loss": 5.270006942749023,
1645
+ "step": 2280
1646
+ },
1647
+ {
1648
+ "epoch": 0.3868896773103565,
1649
+ "grad_norm": 0.8181013464927673,
1650
+ "learning_rate": 0.0002993616853693452,
1651
+ "loss": 5.2521240234375,
1652
+ "step": 2290
1653
+ },
1654
+ {
1655
+ "epoch": 0.38857915188376413,
1656
+ "grad_norm": 0.6938267350196838,
1657
+ "learning_rate": 0.0002993167812341886,
1658
+ "loss": 5.245725631713867,
1659
+ "step": 2300
1660
+ },
1661
+ {
1662
+ "epoch": 0.3902686264571718,
1663
+ "grad_norm": 0.7547310590744019,
1664
+ "learning_rate": 0.0002992703544691127,
1665
+ "loss": 5.216205596923828,
1666
+ "step": 2310
1667
+ },
1668
+ {
1669
+ "epoch": 0.3919581010305795,
1670
+ "grad_norm": 0.8312097787857056,
1671
+ "learning_rate": 0.00029922240554754577,
1672
+ "loss": 5.221836090087891,
1673
+ "step": 2320
1674
+ },
1675
+ {
1676
+ "epoch": 0.39364757560398717,
1677
+ "grad_norm": 0.8383576273918152,
1678
+ "learning_rate": 0.00029917293495843793,
1679
+ "loss": 5.21032485961914,
1680
+ "step": 2330
1681
+ },
1682
+ {
1683
+ "epoch": 0.39533705017739484,
1684
+ "grad_norm": 0.6876690983772278,
1685
+ "learning_rate": 0.0002991219432062562,
1686
+ "loss": 5.247097778320312,
1687
+ "step": 2340
1688
+ },
1689
+ {
1690
+ "epoch": 0.3970265247508025,
1691
+ "grad_norm": 0.7683764696121216,
1692
+ "learning_rate": 0.0002990694308109795,
1693
+ "loss": 5.248017883300781,
1694
+ "step": 2350
1695
+ },
1696
+ {
1697
+ "epoch": 0.39871599932421015,
1698
+ "grad_norm": 0.7274552583694458,
1699
+ "learning_rate": 0.0002990153983080932,
1700
+ "loss": 5.210857009887695,
1701
+ "step": 2360
1702
+ },
1703
+ {
1704
+ "epoch": 0.4004054738976178,
1705
+ "grad_norm": 0.7037548422813416,
1706
+ "learning_rate": 0.0002989598462485835,
1707
+ "loss": 5.223086929321289,
1708
+ "step": 2370
1709
+ },
1710
+ {
1711
+ "epoch": 0.4020949484710255,
1712
+ "grad_norm": 0.7380816340446472,
1713
+ "learning_rate": 0.00029890277519893215,
1714
+ "loss": 5.203308486938477,
1715
+ "step": 2380
1716
+ },
1717
+ {
1718
+ "epoch": 0.4037844230444332,
1719
+ "grad_norm": 0.6980042457580566,
1720
+ "learning_rate": 0.0002988441857411106,
1721
+ "loss": 5.187635803222657,
1722
+ "step": 2390
1723
+ },
1724
+ {
1725
+ "epoch": 0.40547389761784086,
1726
+ "grad_norm": 0.7107545137405396,
1727
+ "learning_rate": 0.0002987840784725737,
1728
+ "loss": 5.192476654052735,
1729
+ "step": 2400
1730
+ },
1731
+ {
1732
+ "epoch": 0.40716337219124854,
1733
+ "grad_norm": 0.7168161869049072,
1734
+ "learning_rate": 0.0002987224540062542,
1735
+ "loss": 5.191188812255859,
1736
+ "step": 2410
1737
+ },
1738
+ {
1739
+ "epoch": 0.4088528467646562,
1740
+ "grad_norm": 0.8272731900215149,
1741
+ "learning_rate": 0.00029865931297055605,
1742
+ "loss": 5.169822692871094,
1743
+ "step": 2420
1744
+ },
1745
+ {
1746
+ "epoch": 0.41054232133806384,
1747
+ "grad_norm": 0.8275768160820007,
1748
+ "learning_rate": 0.00029859465600934814,
1749
+ "loss": 5.191945266723633,
1750
+ "step": 2430
1751
+ },
1752
+ {
1753
+ "epoch": 0.4122317959114715,
1754
+ "grad_norm": 0.7465378642082214,
1755
+ "learning_rate": 0.0002985284837819577,
1756
+ "loss": 5.1748401641845705,
1757
+ "step": 2440
1758
+ },
1759
+ {
1760
+ "epoch": 0.4139212704848792,
1761
+ "grad_norm": 0.7874022126197815,
1762
+ "learning_rate": 0.0002984607969631636,
1763
+ "loss": 5.167286682128906,
1764
+ "step": 2450
1765
+ },
1766
+ {
1767
+ "epoch": 0.4156107450582869,
1768
+ "grad_norm": 0.7480391263961792,
1769
+ "learning_rate": 0.00029839159624318954,
1770
+ "loss": 5.167938232421875,
1771
+ "step": 2460
1772
+ },
1773
+ {
1774
+ "epoch": 0.41730021963169456,
1775
+ "grad_norm": 0.7812421917915344,
1776
+ "learning_rate": 0.00029832088232769694,
1777
+ "loss": 5.159024810791015,
1778
+ "step": 2470
1779
+ },
1780
+ {
1781
+ "epoch": 0.41898969420510224,
1782
+ "grad_norm": 0.7070655226707458,
1783
+ "learning_rate": 0.0002982486559377776,
1784
+ "loss": 5.166957092285156,
1785
+ "step": 2480
1786
+ },
1787
+ {
1788
+ "epoch": 0.42067916877850986,
1789
+ "grad_norm": 0.6980714797973633,
1790
+ "learning_rate": 0.0002981749178099467,
1791
+ "loss": 5.127694702148437,
1792
+ "step": 2490
1793
+ },
1794
+ {
1795
+ "epoch": 0.42236864335191754,
1796
+ "grad_norm": 0.661147952079773,
1797
+ "learning_rate": 0.000298099668696135,
1798
+ "loss": 5.172074890136718,
1799
+ "step": 2500
1800
+ },
1801
+ {
1802
+ "epoch": 0.42236864335191754,
1803
+ "eval_loss": 5.1130194664001465,
1804
+ "eval_runtime": 3.6713,
1805
+ "eval_samples_per_second": 272.384,
1806
+ "eval_steps_per_second": 5.72,
1807
+ "step": 2500
1808
  }
1809
  ],
1810
  "logging_steps": 10,
 
1824
  "attributes": {}
1825
  }
1826
  },
1827
+ "total_flos": 8.361420521472e+16,
1828
  "train_batch_size": 48,
1829
  "trial_name": null,
1830
  "trial_params": null