Ba2han commited on
Commit
0aeef93
·
verified ·
1 Parent(s): ba722f8

Training in progress, step 503, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:622ecd7be560bdbf226ecf8fd61144b01025d68163856053e09ea432de0b54f2
3
  size 1049614696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a8012a7e4ebbcf674e7c053aab2984371f5b66d18160874ef76e47df13d7d10
3
  size 1049614696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66eaacd2f49b512e2c61c4f12a4643bc826503e2993600f1ac775b0387e29469
3
  size 1372902609
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3767a23772d915d4f0d08aecbd8ba1007c2450b7f4264217c6dc062863bcbb62
3
  size 1372902609
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3642db2f13fe0c8bf25dc88e523139fe4e7db636a1b7146241a3216eeebaf086
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abb0ed761d9206a280c8415a8884f5ff0e1548d6a1c2cc83a21b80bc00311e4e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0049751243781095,
6
  "eval_steps": 76,
7
- "global_step": 404,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1462,6 +1462,365 @@
1462
  "learning_rate": 4.4928312680573064e-05,
1463
  "loss": 1.4603819847106934,
1464
  "step": 404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1465
  }
1466
  ],
1467
  "logging_steps": 2,
@@ -1476,12 +1835,12 @@
1476
  "should_evaluate": false,
1477
  "should_log": false,
1478
  "should_save": true,
1479
- "should_training_stop": false
1480
  },
1481
  "attributes": {}
1482
  }
1483
  },
1484
- "total_flos": 9585726492508160.0,
1485
  "train_batch_size": 4,
1486
  "trial_name": null,
1487
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2512437810945274,
6
  "eval_steps": 76,
7
+ "global_step": 503,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1462
  "learning_rate": 4.4928312680573064e-05,
1463
  "loss": 1.4603819847106934,
1464
  "step": 404
1465
+ },
1466
+ {
1467
+ "epoch": 1.0099502487562189,
1468
+ "grad_norm": 0.7265625,
1469
+ "learning_rate": 4.415111107797445e-05,
1470
+ "loss": 1.4816609621047974,
1471
+ "step": 406
1472
+ },
1473
+ {
1474
+ "epoch": 1.0149253731343284,
1475
+ "grad_norm": 0.6640625,
1476
+ "learning_rate": 4.332629679574566e-05,
1477
+ "loss": 1.4633798599243164,
1478
+ "step": 408
1479
+ },
1480
+ {
1481
+ "epoch": 1.0199004975124377,
1482
+ "grad_norm": 0.75390625,
1483
+ "learning_rate": 4.245592045215182e-05,
1484
+ "loss": 1.4657684564590454,
1485
+ "step": 410
1486
+ },
1487
+ {
1488
+ "epoch": 1.0248756218905473,
1489
+ "grad_norm": 0.59765625,
1490
+ "learning_rate": 4.154214593992149e-05,
1491
+ "loss": 1.4662880897521973,
1492
+ "step": 412
1493
+ },
1494
+ {
1495
+ "epoch": 1.0298507462686568,
1496
+ "grad_norm": 0.8125,
1497
+ "learning_rate": 4.058724504646834e-05,
1498
+ "loss": 1.4156984090805054,
1499
+ "step": 414
1500
+ },
1501
+ {
1502
+ "epoch": 1.0348258706467661,
1503
+ "grad_norm": 0.73046875,
1504
+ "learning_rate": 3.959359180586975e-05,
1505
+ "loss": 1.4586745500564575,
1506
+ "step": 416
1507
+ },
1508
+ {
1509
+ "epoch": 1.0398009950248757,
1510
+ "grad_norm": 0.58984375,
1511
+ "learning_rate": 3.856365659664399e-05,
1512
+ "loss": 1.4008747339248657,
1513
+ "step": 418
1514
+ },
1515
+ {
1516
+ "epoch": 1.044776119402985,
1517
+ "grad_norm": 0.67578125,
1518
+ "learning_rate": 3.7500000000000003e-05,
1519
+ "loss": 1.5016967058181763,
1520
+ "step": 420
1521
+ },
1522
+ {
1523
+ "epoch": 1.0497512437810945,
1524
+ "grad_norm": 0.6875,
1525
+ "learning_rate": 3.6405266433829075e-05,
1526
+ "loss": 1.420767903327942,
1527
+ "step": 422
1528
+ },
1529
+ {
1530
+ "epoch": 1.054726368159204,
1531
+ "grad_norm": 0.796875,
1532
+ "learning_rate": 3.5282177578265296e-05,
1533
+ "loss": 1.4615557193756104,
1534
+ "step": 424
1535
+ },
1536
+ {
1537
+ "epoch": 1.0597014925373134,
1538
+ "grad_norm": 0.6875,
1539
+ "learning_rate": 3.413352560915988e-05,
1540
+ "loss": 1.426128625869751,
1541
+ "step": 426
1542
+ },
1543
+ {
1544
+ "epoch": 1.064676616915423,
1545
+ "grad_norm": 0.84765625,
1546
+ "learning_rate": 3.2962166256292113e-05,
1547
+ "loss": 1.4995248317718506,
1548
+ "step": 428
1549
+ },
1550
+ {
1551
+ "epoch": 1.0696517412935322,
1552
+ "grad_norm": 0.73046875,
1553
+ "learning_rate": 3.177101170357513e-05,
1554
+ "loss": 1.544938325881958,
1555
+ "step": 430
1556
+ },
1557
+ {
1558
+ "epoch": 1.0746268656716418,
1559
+ "grad_norm": 0.68359375,
1560
+ "learning_rate": 3.056302334890786e-05,
1561
+ "loss": 1.5545825958251953,
1562
+ "step": 432
1563
+ },
1564
+ {
1565
+ "epoch": 1.0796019900497513,
1566
+ "grad_norm": 0.6875,
1567
+ "learning_rate": 2.9341204441673266e-05,
1568
+ "loss": 1.4614660739898682,
1569
+ "step": 434
1570
+ },
1571
+ {
1572
+ "epoch": 1.0845771144278606,
1573
+ "grad_norm": 0.765625,
1574
+ "learning_rate": 2.8108592616187133e-05,
1575
+ "loss": 1.4706202745437622,
1576
+ "step": 436
1577
+ },
1578
+ {
1579
+ "epoch": 1.0895522388059702,
1580
+ "grad_norm": 0.66796875,
1581
+ "learning_rate": 2.686825233966061e-05,
1582
+ "loss": 1.4637281894683838,
1583
+ "step": 438
1584
+ },
1585
+ {
1586
+ "epoch": 1.0945273631840795,
1587
+ "grad_norm": 0.84375,
1588
+ "learning_rate": 2.5623267293451826e-05,
1589
+ "loss": 1.4879995584487915,
1590
+ "step": 440
1591
+ },
1592
+ {
1593
+ "epoch": 1.099502487562189,
1594
+ "grad_norm": 0.75390625,
1595
+ "learning_rate": 2.4376732706548183e-05,
1596
+ "loss": 1.5506470203399658,
1597
+ "step": 442
1598
+ },
1599
+ {
1600
+ "epoch": 1.1044776119402986,
1601
+ "grad_norm": 0.73828125,
1602
+ "learning_rate": 2.3131747660339394e-05,
1603
+ "loss": 1.5075286626815796,
1604
+ "step": 444
1605
+ },
1606
+ {
1607
+ "epoch": 1.109452736318408,
1608
+ "grad_norm": 0.9453125,
1609
+ "learning_rate": 2.189140738381288e-05,
1610
+ "loss": 1.4878989458084106,
1611
+ "step": 446
1612
+ },
1613
+ {
1614
+ "epoch": 1.1144278606965174,
1615
+ "grad_norm": 0.7421875,
1616
+ "learning_rate": 2.0658795558326743e-05,
1617
+ "loss": 1.4383412599563599,
1618
+ "step": 448
1619
+ },
1620
+ {
1621
+ "epoch": 1.1194029850746268,
1622
+ "grad_norm": 0.7265625,
1623
+ "learning_rate": 1.9436976651092144e-05,
1624
+ "loss": 1.5081899166107178,
1625
+ "step": 450
1626
+ },
1627
+ {
1628
+ "epoch": 1.1243781094527363,
1629
+ "grad_norm": 0.8203125,
1630
+ "learning_rate": 1.8228988296424877e-05,
1631
+ "loss": 1.497464656829834,
1632
+ "step": 452
1633
+ },
1634
+ {
1635
+ "epoch": 1.1293532338308458,
1636
+ "grad_norm": 0.80078125,
1637
+ "learning_rate": 1.7037833743707892e-05,
1638
+ "loss": 1.4927465915679932,
1639
+ "step": 454
1640
+ },
1641
+ {
1642
+ "epoch": 1.1343283582089552,
1643
+ "grad_norm": 0.7109375,
1644
+ "learning_rate": 1.5866474390840125e-05,
1645
+ "loss": 1.4225599765777588,
1646
+ "step": 456
1647
+ },
1648
+ {
1649
+ "epoch": 1.1343283582089552,
1650
+ "eval_loss": 1.460990071296692,
1651
+ "eval_runtime": 1.4591,
1652
+ "eval_samples_per_second": 89.097,
1653
+ "eval_steps_per_second": 11.651,
1654
+ "step": 456
1655
+ },
1656
+ {
1657
+ "epoch": 1.1393034825870647,
1658
+ "grad_norm": 0.7109375,
1659
+ "learning_rate": 1.4717822421734718e-05,
1660
+ "loss": 1.3992186784744263,
1661
+ "step": 458
1662
+ },
1663
+ {
1664
+ "epoch": 1.144278606965174,
1665
+ "grad_norm": 0.703125,
1666
+ "learning_rate": 1.3594733566170926e-05,
1667
+ "loss": 1.5498771667480469,
1668
+ "step": 460
1669
+ },
1670
+ {
1671
+ "epoch": 1.1492537313432836,
1672
+ "grad_norm": 0.64453125,
1673
+ "learning_rate": 1.2500000000000006e-05,
1674
+ "loss": 1.4473354816436768,
1675
+ "step": 462
1676
+ },
1677
+ {
1678
+ "epoch": 1.154228855721393,
1679
+ "grad_norm": 0.7578125,
1680
+ "learning_rate": 1.1436343403356017e-05,
1681
+ "loss": 1.4893980026245117,
1682
+ "step": 464
1683
+ },
1684
+ {
1685
+ "epoch": 1.1592039800995024,
1686
+ "grad_norm": 0.75390625,
1687
+ "learning_rate": 1.0406408194130259e-05,
1688
+ "loss": 1.4563506841659546,
1689
+ "step": 466
1690
+ },
1691
+ {
1692
+ "epoch": 1.164179104477612,
1693
+ "grad_norm": 0.70703125,
1694
+ "learning_rate": 9.412754953531663e-06,
1695
+ "loss": 1.4524166584014893,
1696
+ "step": 468
1697
+ },
1698
+ {
1699
+ "epoch": 1.1691542288557213,
1700
+ "grad_norm": 0.7265625,
1701
+ "learning_rate": 8.45785406007852e-06,
1702
+ "loss": 1.4781806468963623,
1703
+ "step": 470
1704
+ },
1705
+ {
1706
+ "epoch": 1.1741293532338308,
1707
+ "grad_norm": 0.69140625,
1708
+ "learning_rate": 7.5440795478481815e-06,
1709
+ "loss": 1.5440560579299927,
1710
+ "step": 472
1711
+ },
1712
+ {
1713
+ "epoch": 1.1791044776119404,
1714
+ "grad_norm": 0.83203125,
1715
+ "learning_rate": 6.673703204254347e-06,
1716
+ "loss": 1.4499876499176025,
1717
+ "step": 474
1718
+ },
1719
+ {
1720
+ "epoch": 1.1840796019900497,
1721
+ "grad_norm": 0.69140625,
1722
+ "learning_rate": 5.848888922025553e-06,
1723
+ "loss": 1.5100181102752686,
1724
+ "step": 476
1725
+ },
1726
+ {
1727
+ "epoch": 1.1890547263681592,
1728
+ "grad_norm": 0.765625,
1729
+ "learning_rate": 5.071687319426946e-06,
1730
+ "loss": 1.4939875602722168,
1731
+ "step": 478
1732
+ },
1733
+ {
1734
+ "epoch": 1.1940298507462686,
1735
+ "grad_norm": 0.6640625,
1736
+ "learning_rate": 4.344030642100133e-06,
1737
+ "loss": 1.5198827981948853,
1738
+ "step": 480
1739
+ },
1740
+ {
1741
+ "epoch": 1.199004975124378,
1742
+ "grad_norm": 0.66015625,
1743
+ "learning_rate": 3.66772795919611e-06,
1744
+ "loss": 1.490272879600525,
1745
+ "step": 482
1746
+ },
1747
+ {
1748
+ "epoch": 1.2039800995024876,
1749
+ "grad_norm": 0.765625,
1750
+ "learning_rate": 3.044460665744284e-06,
1751
+ "loss": 1.5192978382110596,
1752
+ "step": 484
1753
+ },
1754
+ {
1755
+ "epoch": 1.208955223880597,
1756
+ "grad_norm": 0.73046875,
1757
+ "learning_rate": 2.475778302439524e-06,
1758
+ "loss": 1.4486744403839111,
1759
+ "step": 486
1760
+ },
1761
+ {
1762
+ "epoch": 1.2139303482587065,
1763
+ "grad_norm": 0.6796875,
1764
+ "learning_rate": 1.9630947032398067e-06,
1765
+ "loss": 1.4685592651367188,
1766
+ "step": 488
1767
+ },
1768
+ {
1769
+ "epoch": 1.2189054726368158,
1770
+ "grad_norm": 0.71875,
1771
+ "learning_rate": 1.5076844803522922e-06,
1772
+ "loss": 1.3960230350494385,
1773
+ "step": 490
1774
+ },
1775
+ {
1776
+ "epoch": 1.2238805970149254,
1777
+ "grad_norm": 0.5859375,
1778
+ "learning_rate": 1.1106798553464804e-06,
1779
+ "loss": 1.4277310371398926,
1780
+ "step": 492
1781
+ },
1782
+ {
1783
+ "epoch": 1.228855721393035,
1784
+ "grad_norm": 0.66015625,
1785
+ "learning_rate": 7.730678442730538e-07,
1786
+ "loss": 1.5258288383483887,
1787
+ "step": 494
1788
+ },
1789
+ {
1790
+ "epoch": 1.2338308457711442,
1791
+ "grad_norm": 0.80859375,
1792
+ "learning_rate": 4.956878037864043e-07,
1793
+ "loss": 1.5295201539993286,
1794
+ "step": 496
1795
+ },
1796
+ {
1797
+ "epoch": 1.2388059701492538,
1798
+ "grad_norm": 0.7734375,
1799
+ "learning_rate": 2.7922934437178695e-07,
1800
+ "loss": 1.461329698562622,
1801
+ "step": 498
1802
+ },
1803
+ {
1804
+ "epoch": 1.243781094527363,
1805
+ "grad_norm": 0.72265625,
1806
+ "learning_rate": 1.2423061586496477e-07,
1807
+ "loss": 1.4992496967315674,
1808
+ "step": 500
1809
+ },
1810
+ {
1811
+ "epoch": 1.2487562189054726,
1812
+ "grad_norm": 0.78515625,
1813
+ "learning_rate": 3.107696952694139e-08,
1814
+ "loss": 1.4631916284561157,
1815
+ "step": 502
1816
+ },
1817
+ {
1818
+ "epoch": 1.2512437810945274,
1819
+ "eval_loss": 1.4604582786560059,
1820
+ "eval_runtime": 1.4388,
1821
+ "eval_samples_per_second": 90.352,
1822
+ "eval_steps_per_second": 11.815,
1823
+ "step": 503
1824
  }
1825
  ],
1826
  "logging_steps": 2,
 
1835
  "should_evaluate": false,
1836
  "should_log": false,
1837
  "should_save": true,
1838
+ "should_training_stop": true
1839
  },
1840
  "attributes": {}
1841
  }
1842
  },
1843
+ "total_flos": 1.1908427380948992e+16,
1844
  "train_batch_size": 4,
1845
  "trial_name": null,
1846
  "trial_params": null