mtzig commited on
Commit
1075474
·
verified ·
1 Parent(s): 10eec54

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:534c42076a246c6da5a00ffd44149115e4d50f42ee2ee4186468f5798dbb1ccf
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64ce02d94b7d924c632a2067f73c82bb18c67f7a726992c6269ace98bdc6f056
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a663829a3b941a4048ffcc2de6e0512c94c579c7a489c3e009cf30a2a53e694d
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3fe0457fac8b753b7310b82e2980163e79f3ec63eda7fc5078528d806ac9366
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edb36318fb7a6485a66f873289f77615fb974210ae47a75c352e9d4d2d4426d8
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cdb0f1caf04a6ef7c87c522ede00a27f1e96ab25cd3d03dc1218ed3d6bf20b8
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6802368c5f12cbf6130a30d93da9380768ce9f37bbb6bd21b02b9e602182fbcd
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bc82b46510200e245a640ac75a1015933d3bfdd29a9f14f7e6e3cf00746cc9f
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7a755e47d66d671add11d66f6099b1dd83a6c13121c2ef15fdfdde9a3177177
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f062cfd6da4a5849543ad9f156b79c7cb80350250de63ee7e3ea29c94d29be4
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb3edd1ccbdea3e3f2d56cd3a4646f38afe7ca93815da1414f65fe03b9b673a2
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9629d3c55606b00e24d749b100b32fb20097550ff7d500801506540c686e621
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4599060b53a4939dd8f840249e269e7830878980b2cf9fafb1b39f1203aaa960
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:102dc9d8e967de85cf93dfd57a3dcb2e8948d2b8aec650c2929692368e17358b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e02bfd751b50b769ac97d99b9a8385f2091de1188f94cbd07e5f93afeae257da
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a95ccc72a79fe5e1e19337f8b060654731a9cc11cc1a00aca76960a6644d8396
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51118da612b8171b6675abc2602bed7ce97edefe29a8f466ea28ed45a226a206
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5cb7c663bda618877a07c8634119b894f845c6c1f48ffc773fae5fc7587910b
3
  size 15088
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62710a98a3c7bb382ced086930b4b07cc7dc4c19e47a9f58b3464ec46167033a
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f5facd7e65af3371a0f80215db840e12650b3c5facc3bc8e9c313b25307f1c5
3
  size 15088
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e4c5366934a63a65595c9de33e3c7b09bdf1751d64db6f76892cbdd781442b1
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1b0b8e0d8818ddd524ea79ea80b4c68befffe51c42d62588c2d5ea88da10a4b
3
  size 15088
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f747b50387c790da0638d4436a970217188c80f6b7d2d6cc099b8ddf28c5197c
3
  size 15088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a7feadabf89d4316a6005d255da1c51ac5f256268f862f7e85e39e78d9a3025
3
  size 15088
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a81a95c7d38a4c117734641266299d17605df7b45470c7b744f36bacf620813f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdc582f335d9f0cb32903e57fe4f3123895d2a8b97f7e3cac7005c8f96d98b1d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.1694915254237288,
5
  "eval_steps": 20,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1539,6 +1539,766 @@
1539
  "eval_samples_per_second": 5.75,
1540
  "eval_steps_per_second": 0.197,
1541
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1542
  }
1543
  ],
1544
  "logging_steps": 1,
@@ -1558,7 +2318,7 @@
1558
  "attributes": {}
1559
  }
1560
  },
1561
- "total_flos": 6.207084150810214e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2542372881355932,
5
  "eval_steps": 20,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1539
  "eval_samples_per_second": 5.75,
1540
  "eval_steps_per_second": 0.197,
1541
  "step": 200
1542
+ },
1543
+ {
1544
+ "epoch": 0.17033898305084746,
1545
+ "grad_norm": 2.295042037963867,
1546
+ "learning_rate": 1.9700087871633267e-05,
1547
+ "loss": 0.0225,
1548
+ "step": 201
1549
+ },
1550
+ {
1551
+ "epoch": 0.1711864406779661,
1552
+ "grad_norm": 1.9021865129470825,
1553
+ "learning_rate": 1.9692854983784235e-05,
1554
+ "loss": 0.0282,
1555
+ "step": 202
1556
+ },
1557
+ {
1558
+ "epoch": 0.17203389830508475,
1559
+ "grad_norm": 6.971744060516357,
1560
+ "learning_rate": 1.9685537275184776e-05,
1561
+ "loss": 0.0743,
1562
+ "step": 203
1563
+ },
1564
+ {
1565
+ "epoch": 0.17288135593220338,
1566
+ "grad_norm": 3.0627219676971436,
1567
+ "learning_rate": 1.9678134809871085e-05,
1568
+ "loss": 0.0305,
1569
+ "step": 204
1570
+ },
1571
+ {
1572
+ "epoch": 0.17372881355932204,
1573
+ "grad_norm": 2.7916667461395264,
1574
+ "learning_rate": 1.9670647652621044e-05,
1575
+ "loss": 0.0407,
1576
+ "step": 205
1577
+ },
1578
+ {
1579
+ "epoch": 0.17457627118644067,
1580
+ "grad_norm": 0.5132026076316833,
1581
+ "learning_rate": 1.966307586895367e-05,
1582
+ "loss": 0.0058,
1583
+ "step": 206
1584
+ },
1585
+ {
1586
+ "epoch": 0.17542372881355933,
1587
+ "grad_norm": 2.076672077178955,
1588
+ "learning_rate": 1.9655419525128528e-05,
1589
+ "loss": 0.0275,
1590
+ "step": 207
1591
+ },
1592
+ {
1593
+ "epoch": 0.17627118644067796,
1594
+ "grad_norm": 1.9575265645980835,
1595
+ "learning_rate": 1.9647678688145163e-05,
1596
+ "loss": 0.0178,
1597
+ "step": 208
1598
+ },
1599
+ {
1600
+ "epoch": 0.17711864406779662,
1601
+ "grad_norm": 2.161008358001709,
1602
+ "learning_rate": 1.9639853425742496e-05,
1603
+ "loss": 0.0262,
1604
+ "step": 209
1605
+ },
1606
+ {
1607
+ "epoch": 0.17796610169491525,
1608
+ "grad_norm": 2.026916742324829,
1609
+ "learning_rate": 1.963194380639825e-05,
1610
+ "loss": 0.0257,
1611
+ "step": 210
1612
+ },
1613
+ {
1614
+ "epoch": 0.1788135593220339,
1615
+ "grad_norm": 1.033382773399353,
1616
+ "learning_rate": 1.9623949899328352e-05,
1617
+ "loss": 0.017,
1618
+ "step": 211
1619
+ },
1620
+ {
1621
+ "epoch": 0.17966101694915254,
1622
+ "grad_norm": 1.5789072513580322,
1623
+ "learning_rate": 1.9615871774486293e-05,
1624
+ "loss": 0.0241,
1625
+ "step": 212
1626
+ },
1627
+ {
1628
+ "epoch": 0.1805084745762712,
1629
+ "grad_norm": 1.8820958137512207,
1630
+ "learning_rate": 1.960770950256257e-05,
1631
+ "loss": 0.031,
1632
+ "step": 213
1633
+ },
1634
+ {
1635
+ "epoch": 0.18135593220338983,
1636
+ "grad_norm": 3.501945734024048,
1637
+ "learning_rate": 1.959946315498402e-05,
1638
+ "loss": 0.0479,
1639
+ "step": 214
1640
+ },
1641
+ {
1642
+ "epoch": 0.18220338983050846,
1643
+ "grad_norm": 1.3452684879302979,
1644
+ "learning_rate": 1.959113280391322e-05,
1645
+ "loss": 0.0158,
1646
+ "step": 215
1647
+ },
1648
+ {
1649
+ "epoch": 0.18305084745762712,
1650
+ "grad_norm": 2.5982766151428223,
1651
+ "learning_rate": 1.9582718522247854e-05,
1652
+ "loss": 0.0331,
1653
+ "step": 216
1654
+ },
1655
+ {
1656
+ "epoch": 0.18389830508474575,
1657
+ "grad_norm": 3.8872103691101074,
1658
+ "learning_rate": 1.9574220383620054e-05,
1659
+ "loss": 0.023,
1660
+ "step": 217
1661
+ },
1662
+ {
1663
+ "epoch": 0.1847457627118644,
1664
+ "grad_norm": 1.6585066318511963,
1665
+ "learning_rate": 1.9565638462395796e-05,
1666
+ "loss": 0.0235,
1667
+ "step": 218
1668
+ },
1669
+ {
1670
+ "epoch": 0.18559322033898304,
1671
+ "grad_norm": 1.910556435585022,
1672
+ "learning_rate": 1.95569728336742e-05,
1673
+ "loss": 0.0249,
1674
+ "step": 219
1675
+ },
1676
+ {
1677
+ "epoch": 0.1864406779661017,
1678
+ "grad_norm": 2.956751823425293,
1679
+ "learning_rate": 1.954822357328692e-05,
1680
+ "loss": 0.0271,
1681
+ "step": 220
1682
+ },
1683
+ {
1684
+ "epoch": 0.1864406779661017,
1685
+ "eval_accuracy": 0.9985815602836879,
1686
+ "eval_f1": 0.9974160206718347,
1687
+ "eval_loss": 0.007490006275475025,
1688
+ "eval_precision": 1.0,
1689
+ "eval_recall": 0.9948453608247423,
1690
+ "eval_runtime": 50.7062,
1691
+ "eval_samples_per_second": 5.759,
1692
+ "eval_steps_per_second": 0.197,
1693
+ "step": 220
1694
+ },
1695
+ {
1696
+ "epoch": 0.18728813559322033,
1697
+ "grad_norm": 3.494168281555176,
1698
+ "learning_rate": 1.9539390757797444e-05,
1699
+ "loss": 0.0706,
1700
+ "step": 221
1701
+ },
1702
+ {
1703
+ "epoch": 0.188135593220339,
1704
+ "grad_norm": 2.3897652626037598,
1705
+ "learning_rate": 1.9530474464500445e-05,
1706
+ "loss": 0.0252,
1707
+ "step": 222
1708
+ },
1709
+ {
1710
+ "epoch": 0.18898305084745762,
1711
+ "grad_norm": 2.2590034008026123,
1712
+ "learning_rate": 1.9521474771421093e-05,
1713
+ "loss": 0.033,
1714
+ "step": 223
1715
+ },
1716
+ {
1717
+ "epoch": 0.18983050847457628,
1718
+ "grad_norm": 3.1695659160614014,
1719
+ "learning_rate": 1.9512391757314382e-05,
1720
+ "loss": 0.0475,
1721
+ "step": 224
1722
+ },
1723
+ {
1724
+ "epoch": 0.1906779661016949,
1725
+ "grad_norm": 0.6561757922172546,
1726
+ "learning_rate": 1.950322550166444e-05,
1727
+ "loss": 0.0089,
1728
+ "step": 225
1729
+ },
1730
+ {
1731
+ "epoch": 0.19152542372881357,
1732
+ "grad_norm": 0.832501232624054,
1733
+ "learning_rate": 1.9493976084683814e-05,
1734
+ "loss": 0.0064,
1735
+ "step": 226
1736
+ },
1737
+ {
1738
+ "epoch": 0.1923728813559322,
1739
+ "grad_norm": 2.210726022720337,
1740
+ "learning_rate": 1.94846435873128e-05,
1741
+ "loss": 0.0372,
1742
+ "step": 227
1743
+ },
1744
+ {
1745
+ "epoch": 0.19322033898305085,
1746
+ "grad_norm": 1.805579662322998,
1747
+ "learning_rate": 1.9475228091218712e-05,
1748
+ "loss": 0.0288,
1749
+ "step": 228
1750
+ },
1751
+ {
1752
+ "epoch": 0.19406779661016949,
1753
+ "grad_norm": 1.477453351020813,
1754
+ "learning_rate": 1.946572967879517e-05,
1755
+ "loss": 0.01,
1756
+ "step": 229
1757
+ },
1758
+ {
1759
+ "epoch": 0.19491525423728814,
1760
+ "grad_norm": 1.16657555103302,
1761
+ "learning_rate": 1.9456148433161387e-05,
1762
+ "loss": 0.0183,
1763
+ "step": 230
1764
+ },
1765
+ {
1766
+ "epoch": 0.19576271186440677,
1767
+ "grad_norm": 6.658266544342041,
1768
+ "learning_rate": 1.944648443816144e-05,
1769
+ "loss": 0.0464,
1770
+ "step": 231
1771
+ },
1772
+ {
1773
+ "epoch": 0.19661016949152543,
1774
+ "grad_norm": 1.6691462993621826,
1775
+ "learning_rate": 1.9436737778363526e-05,
1776
+ "loss": 0.014,
1777
+ "step": 232
1778
+ },
1779
+ {
1780
+ "epoch": 0.19745762711864406,
1781
+ "grad_norm": 1.3182052373886108,
1782
+ "learning_rate": 1.942690853905924e-05,
1783
+ "loss": 0.0092,
1784
+ "step": 233
1785
+ },
1786
+ {
1787
+ "epoch": 0.19830508474576272,
1788
+ "grad_norm": 0.46388718485832214,
1789
+ "learning_rate": 1.94169968062628e-05,
1790
+ "loss": 0.004,
1791
+ "step": 234
1792
+ },
1793
+ {
1794
+ "epoch": 0.19915254237288135,
1795
+ "grad_norm": 3.9312496185302734,
1796
+ "learning_rate": 1.9407002666710334e-05,
1797
+ "loss": 0.0451,
1798
+ "step": 235
1799
+ },
1800
+ {
1801
+ "epoch": 0.2,
1802
+ "grad_norm": 1.5306397676467896,
1803
+ "learning_rate": 1.9396926207859085e-05,
1804
+ "loss": 0.0169,
1805
+ "step": 236
1806
+ },
1807
+ {
1808
+ "epoch": 0.20084745762711864,
1809
+ "grad_norm": 1.4190640449523926,
1810
+ "learning_rate": 1.9386767517886666e-05,
1811
+ "loss": 0.0248,
1812
+ "step": 237
1813
+ },
1814
+ {
1815
+ "epoch": 0.2016949152542373,
1816
+ "grad_norm": 2.2406575679779053,
1817
+ "learning_rate": 1.937652668569028e-05,
1818
+ "loss": 0.0259,
1819
+ "step": 238
1820
+ },
1821
+ {
1822
+ "epoch": 0.20254237288135593,
1823
+ "grad_norm": 0.8504940867424011,
1824
+ "learning_rate": 1.9366203800885944e-05,
1825
+ "loss": 0.0076,
1826
+ "step": 239
1827
+ },
1828
+ {
1829
+ "epoch": 0.2033898305084746,
1830
+ "grad_norm": 1.1296645402908325,
1831
+ "learning_rate": 1.9355798953807715e-05,
1832
+ "loss": 0.0071,
1833
+ "step": 240
1834
+ },
1835
+ {
1836
+ "epoch": 0.2033898305084746,
1837
+ "eval_accuracy": 0.9985815602836879,
1838
+ "eval_f1": 0.9974160206718347,
1839
+ "eval_loss": 0.007261006161570549,
1840
+ "eval_precision": 1.0,
1841
+ "eval_recall": 0.9948453608247423,
1842
+ "eval_runtime": 50.5554,
1843
+ "eval_samples_per_second": 5.776,
1844
+ "eval_steps_per_second": 0.198,
1845
+ "step": 240
1846
+ },
1847
+ {
1848
+ "epoch": 0.20423728813559322,
1849
+ "grad_norm": 2.6094491481781006,
1850
+ "learning_rate": 1.934531223550687e-05,
1851
+ "loss": 0.0298,
1852
+ "step": 241
1853
+ },
1854
+ {
1855
+ "epoch": 0.20508474576271185,
1856
+ "grad_norm": 3.385869264602661,
1857
+ "learning_rate": 1.933474373775115e-05,
1858
+ "loss": 0.0282,
1859
+ "step": 242
1860
+ },
1861
+ {
1862
+ "epoch": 0.2059322033898305,
1863
+ "grad_norm": 3.9192473888397217,
1864
+ "learning_rate": 1.932409355302392e-05,
1865
+ "loss": 0.0514,
1866
+ "step": 243
1867
+ },
1868
+ {
1869
+ "epoch": 0.20677966101694914,
1870
+ "grad_norm": 2.312788724899292,
1871
+ "learning_rate": 1.9313361774523387e-05,
1872
+ "loss": 0.029,
1873
+ "step": 244
1874
+ },
1875
+ {
1876
+ "epoch": 0.2076271186440678,
1877
+ "grad_norm": 1.603360891342163,
1878
+ "learning_rate": 1.9302548496161765e-05,
1879
+ "loss": 0.0193,
1880
+ "step": 245
1881
+ },
1882
+ {
1883
+ "epoch": 0.20847457627118643,
1884
+ "grad_norm": 3.1534547805786133,
1885
+ "learning_rate": 1.9291653812564468e-05,
1886
+ "loss": 0.0516,
1887
+ "step": 246
1888
+ },
1889
+ {
1890
+ "epoch": 0.2093220338983051,
1891
+ "grad_norm": 0.8444193005561829,
1892
+ "learning_rate": 1.9280677819069273e-05,
1893
+ "loss": 0.0084,
1894
+ "step": 247
1895
+ },
1896
+ {
1897
+ "epoch": 0.21016949152542372,
1898
+ "grad_norm": 1.4498813152313232,
1899
+ "learning_rate": 1.926962061172548e-05,
1900
+ "loss": 0.012,
1901
+ "step": 248
1902
+ },
1903
+ {
1904
+ "epoch": 0.21101694915254238,
1905
+ "grad_norm": 4.520916938781738,
1906
+ "learning_rate": 1.9258482287293097e-05,
1907
+ "loss": 0.0292,
1908
+ "step": 249
1909
+ },
1910
+ {
1911
+ "epoch": 0.211864406779661,
1912
+ "grad_norm": 4.337013244628906,
1913
+ "learning_rate": 1.924726294324196e-05,
1914
+ "loss": 0.0285,
1915
+ "step": 250
1916
+ },
1917
+ {
1918
+ "epoch": 0.21271186440677967,
1919
+ "grad_norm": 3.2843096256256104,
1920
+ "learning_rate": 1.9235962677750898e-05,
1921
+ "loss": 0.0424,
1922
+ "step": 251
1923
+ },
1924
+ {
1925
+ "epoch": 0.2135593220338983,
1926
+ "grad_norm": 2.163046360015869,
1927
+ "learning_rate": 1.922458158970688e-05,
1928
+ "loss": 0.0235,
1929
+ "step": 252
1930
+ },
1931
+ {
1932
+ "epoch": 0.21440677966101696,
1933
+ "grad_norm": 0.903529167175293,
1934
+ "learning_rate": 1.921311977870413e-05,
1935
+ "loss": 0.0111,
1936
+ "step": 253
1937
+ },
1938
+ {
1939
+ "epoch": 0.21525423728813559,
1940
+ "grad_norm": 0.9559667110443115,
1941
+ "learning_rate": 1.9201577345043282e-05,
1942
+ "loss": 0.0097,
1943
+ "step": 254
1944
+ },
1945
+ {
1946
+ "epoch": 0.21610169491525424,
1947
+ "grad_norm": 2.765850782394409,
1948
+ "learning_rate": 1.918995438973047e-05,
1949
+ "loss": 0.0173,
1950
+ "step": 255
1951
+ },
1952
+ {
1953
+ "epoch": 0.21694915254237288,
1954
+ "grad_norm": 1.1641409397125244,
1955
+ "learning_rate": 1.917825101447647e-05,
1956
+ "loss": 0.0073,
1957
+ "step": 256
1958
+ },
1959
+ {
1960
+ "epoch": 0.21779661016949153,
1961
+ "grad_norm": 1.1337021589279175,
1962
+ "learning_rate": 1.91664673216958e-05,
1963
+ "loss": 0.0129,
1964
+ "step": 257
1965
+ },
1966
+ {
1967
+ "epoch": 0.21864406779661016,
1968
+ "grad_norm": 2.058060646057129,
1969
+ "learning_rate": 1.9154603414505825e-05,
1970
+ "loss": 0.0418,
1971
+ "step": 258
1972
+ },
1973
+ {
1974
+ "epoch": 0.21949152542372882,
1975
+ "grad_norm": 2.029142379760742,
1976
+ "learning_rate": 1.9142659396725862e-05,
1977
+ "loss": 0.0289,
1978
+ "step": 259
1979
+ },
1980
+ {
1981
+ "epoch": 0.22033898305084745,
1982
+ "grad_norm": 1.0298879146575928,
1983
+ "learning_rate": 1.9130635372876245e-05,
1984
+ "loss": 0.009,
1985
+ "step": 260
1986
+ },
1987
+ {
1988
+ "epoch": 0.22033898305084745,
1989
+ "eval_accuracy": 1.0,
1990
+ "eval_f1": 1.0,
1991
+ "eval_loss": 0.002083389787003398,
1992
+ "eval_precision": 1.0,
1993
+ "eval_recall": 1.0,
1994
+ "eval_runtime": 50.0162,
1995
+ "eval_samples_per_second": 5.838,
1996
+ "eval_steps_per_second": 0.2,
1997
+ "step": 260
1998
+ },
1999
+ {
2000
+ "epoch": 0.2211864406779661,
2001
+ "grad_norm": 1.038118839263916,
2002
+ "learning_rate": 1.9118531448177446e-05,
2003
+ "loss": 0.0065,
2004
+ "step": 261
2005
+ },
2006
+ {
2007
+ "epoch": 0.22203389830508474,
2008
+ "grad_norm": 1.853901743888855,
2009
+ "learning_rate": 1.9106347728549134e-05,
2010
+ "loss": 0.0317,
2011
+ "step": 262
2012
+ },
2013
+ {
2014
+ "epoch": 0.2228813559322034,
2015
+ "grad_norm": 1.197831392288208,
2016
+ "learning_rate": 1.909408432060925e-05,
2017
+ "loss": 0.0067,
2018
+ "step": 263
2019
+ },
2020
+ {
2021
+ "epoch": 0.22372881355932203,
2022
+ "grad_norm": 1.2054872512817383,
2023
+ "learning_rate": 1.908174133167307e-05,
2024
+ "loss": 0.0105,
2025
+ "step": 264
2026
+ },
2027
+ {
2028
+ "epoch": 0.2245762711864407,
2029
+ "grad_norm": 2.5519094467163086,
2030
+ "learning_rate": 1.906931886975228e-05,
2031
+ "loss": 0.0241,
2032
+ "step": 265
2033
+ },
2034
+ {
2035
+ "epoch": 0.22542372881355932,
2036
+ "grad_norm": 1.8726714849472046,
2037
+ "learning_rate": 1.9056817043554024e-05,
2038
+ "loss": 0.0144,
2039
+ "step": 266
2040
+ },
2041
+ {
2042
+ "epoch": 0.22627118644067798,
2043
+ "grad_norm": 3.216784715652466,
2044
+ "learning_rate": 1.9044235962479945e-05,
2045
+ "loss": 0.0338,
2046
+ "step": 267
2047
+ },
2048
+ {
2049
+ "epoch": 0.2271186440677966,
2050
+ "grad_norm": 2.3170483112335205,
2051
+ "learning_rate": 1.903157573662524e-05,
2052
+ "loss": 0.0178,
2053
+ "step": 268
2054
+ },
2055
+ {
2056
+ "epoch": 0.22796610169491524,
2057
+ "grad_norm": 3.330758571624756,
2058
+ "learning_rate": 1.9018836476777675e-05,
2059
+ "loss": 0.0404,
2060
+ "step": 269
2061
+ },
2062
+ {
2063
+ "epoch": 0.2288135593220339,
2064
+ "grad_norm": 3.121005058288574,
2065
+ "learning_rate": 1.9006018294416648e-05,
2066
+ "loss": 0.0286,
2067
+ "step": 270
2068
+ },
2069
+ {
2070
+ "epoch": 0.22966101694915253,
2071
+ "grad_norm": 0.9276627898216248,
2072
+ "learning_rate": 1.8993121301712194e-05,
2073
+ "loss": 0.0084,
2074
+ "step": 271
2075
+ },
2076
+ {
2077
+ "epoch": 0.2305084745762712,
2078
+ "grad_norm": 0.7775290012359619,
2079
+ "learning_rate": 1.8980145611523996e-05,
2080
+ "loss": 0.0053,
2081
+ "step": 272
2082
+ },
2083
+ {
2084
+ "epoch": 0.23135593220338982,
2085
+ "grad_norm": 3.8352859020233154,
2086
+ "learning_rate": 1.8967091337400418e-05,
2087
+ "loss": 0.0281,
2088
+ "step": 273
2089
+ },
2090
+ {
2091
+ "epoch": 0.23220338983050848,
2092
+ "grad_norm": 1.0906392335891724,
2093
+ "learning_rate": 1.895395859357749e-05,
2094
+ "loss": 0.0128,
2095
+ "step": 274
2096
+ },
2097
+ {
2098
+ "epoch": 0.2330508474576271,
2099
+ "grad_norm": 1.1791564226150513,
2100
+ "learning_rate": 1.894074749497793e-05,
2101
+ "loss": 0.0124,
2102
+ "step": 275
2103
+ },
2104
+ {
2105
+ "epoch": 0.23389830508474577,
2106
+ "grad_norm": 1.4967304468154907,
2107
+ "learning_rate": 1.8927458157210125e-05,
2108
+ "loss": 0.015,
2109
+ "step": 276
2110
+ },
2111
+ {
2112
+ "epoch": 0.2347457627118644,
2113
+ "grad_norm": 1.3809508085250854,
2114
+ "learning_rate": 1.8914090696567104e-05,
2115
+ "loss": 0.0089,
2116
+ "step": 277
2117
+ },
2118
+ {
2119
+ "epoch": 0.23559322033898306,
2120
+ "grad_norm": 1.202942132949829,
2121
+ "learning_rate": 1.8900645230025566e-05,
2122
+ "loss": 0.0113,
2123
+ "step": 278
2124
+ },
2125
+ {
2126
+ "epoch": 0.2364406779661017,
2127
+ "grad_norm": 1.3019424676895142,
2128
+ "learning_rate": 1.8887121875244804e-05,
2129
+ "loss": 0.008,
2130
+ "step": 279
2131
+ },
2132
+ {
2133
+ "epoch": 0.23728813559322035,
2134
+ "grad_norm": 1.8381810188293457,
2135
+ "learning_rate": 1.8873520750565716e-05,
2136
+ "loss": 0.0288,
2137
+ "step": 280
2138
+ },
2139
+ {
2140
+ "epoch": 0.23728813559322035,
2141
+ "eval_accuracy": 1.0,
2142
+ "eval_f1": 1.0,
2143
+ "eval_loss": 0.0014825169928371906,
2144
+ "eval_precision": 1.0,
2145
+ "eval_recall": 1.0,
2146
+ "eval_runtime": 49.7093,
2147
+ "eval_samples_per_second": 5.874,
2148
+ "eval_steps_per_second": 0.201,
2149
+ "step": 280
2150
+ },
2151
+ {
2152
+ "epoch": 0.23813559322033898,
2153
+ "grad_norm": 0.7632168531417847,
2154
+ "learning_rate": 1.8859841975009747e-05,
2155
+ "loss": 0.0062,
2156
+ "step": 281
2157
+ },
2158
+ {
2159
+ "epoch": 0.23898305084745763,
2160
+ "grad_norm": 2.5059711933135986,
2161
+ "learning_rate": 1.884608566827785e-05,
2162
+ "loss": 0.0304,
2163
+ "step": 282
2164
+ },
2165
+ {
2166
+ "epoch": 0.23983050847457626,
2167
+ "grad_norm": 2.896942138671875,
2168
+ "learning_rate": 1.8832251950749443e-05,
2169
+ "loss": 0.0271,
2170
+ "step": 283
2171
+ },
2172
+ {
2173
+ "epoch": 0.24067796610169492,
2174
+ "grad_norm": 2.438624620437622,
2175
+ "learning_rate": 1.8818340943481362e-05,
2176
+ "loss": 0.0236,
2177
+ "step": 284
2178
+ },
2179
+ {
2180
+ "epoch": 0.24152542372881355,
2181
+ "grad_norm": 3.87087082862854,
2182
+ "learning_rate": 1.880435276820678e-05,
2183
+ "loss": 0.0337,
2184
+ "step": 285
2185
+ },
2186
+ {
2187
+ "epoch": 0.2423728813559322,
2188
+ "grad_norm": 1.3031063079833984,
2189
+ "learning_rate": 1.8790287547334178e-05,
2190
+ "loss": 0.009,
2191
+ "step": 286
2192
+ },
2193
+ {
2194
+ "epoch": 0.24322033898305084,
2195
+ "grad_norm": 2.244393825531006,
2196
+ "learning_rate": 1.8776145403946226e-05,
2197
+ "loss": 0.0331,
2198
+ "step": 287
2199
+ },
2200
+ {
2201
+ "epoch": 0.2440677966101695,
2202
+ "grad_norm": 1.3984683752059937,
2203
+ "learning_rate": 1.8761926461798743e-05,
2204
+ "loss": 0.0095,
2205
+ "step": 288
2206
+ },
2207
+ {
2208
+ "epoch": 0.24491525423728813,
2209
+ "grad_norm": 0.9854875206947327,
2210
+ "learning_rate": 1.874763084531961e-05,
2211
+ "loss": 0.0076,
2212
+ "step": 289
2213
+ },
2214
+ {
2215
+ "epoch": 0.2457627118644068,
2216
+ "grad_norm": 1.9005022048950195,
2217
+ "learning_rate": 1.8733258679607674e-05,
2218
+ "loss": 0.0169,
2219
+ "step": 290
2220
+ },
2221
+ {
2222
+ "epoch": 0.24661016949152542,
2223
+ "grad_norm": 2.174389600753784,
2224
+ "learning_rate": 1.871881009043163e-05,
2225
+ "loss": 0.0357,
2226
+ "step": 291
2227
+ },
2228
+ {
2229
+ "epoch": 0.24745762711864408,
2230
+ "grad_norm": 3.0177578926086426,
2231
+ "learning_rate": 1.8704285204228973e-05,
2232
+ "loss": 0.0373,
2233
+ "step": 292
2234
+ },
2235
+ {
2236
+ "epoch": 0.2483050847457627,
2237
+ "grad_norm": 2.5867350101470947,
2238
+ "learning_rate": 1.868968414810484e-05,
2239
+ "loss": 0.035,
2240
+ "step": 293
2241
+ },
2242
+ {
2243
+ "epoch": 0.24915254237288137,
2244
+ "grad_norm": 1.4265027046203613,
2245
+ "learning_rate": 1.8675007049830937e-05,
2246
+ "loss": 0.0138,
2247
+ "step": 294
2248
+ },
2249
+ {
2250
+ "epoch": 0.25,
2251
+ "grad_norm": 0.97405606508255,
2252
+ "learning_rate": 1.866025403784439e-05,
2253
+ "loss": 0.0102,
2254
+ "step": 295
2255
+ },
2256
+ {
2257
+ "epoch": 0.25084745762711863,
2258
+ "grad_norm": 3.0567164421081543,
2259
+ "learning_rate": 1.8645425241246636e-05,
2260
+ "loss": 0.067,
2261
+ "step": 296
2262
+ },
2263
+ {
2264
+ "epoch": 0.25169491525423726,
2265
+ "grad_norm": 1.839958667755127,
2266
+ "learning_rate": 1.8630520789802308e-05,
2267
+ "loss": 0.0252,
2268
+ "step": 297
2269
+ },
2270
+ {
2271
+ "epoch": 0.25254237288135595,
2272
+ "grad_norm": 1.2445701360702515,
2273
+ "learning_rate": 1.8615540813938063e-05,
2274
+ "loss": 0.0391,
2275
+ "step": 298
2276
+ },
2277
+ {
2278
+ "epoch": 0.2533898305084746,
2279
+ "grad_norm": 2.647850275039673,
2280
+ "learning_rate": 1.860048544474147e-05,
2281
+ "loss": 0.0212,
2282
+ "step": 299
2283
+ },
2284
+ {
2285
+ "epoch": 0.2542372881355932,
2286
+ "grad_norm": 2.123107433319092,
2287
+ "learning_rate": 1.858535481395986e-05,
2288
+ "loss": 0.0236,
2289
+ "step": 300
2290
+ },
2291
+ {
2292
+ "epoch": 0.2542372881355932,
2293
+ "eval_accuracy": 1.0,
2294
+ "eval_f1": 1.0,
2295
+ "eval_loss": 0.0011347213294357061,
2296
+ "eval_precision": 1.0,
2297
+ "eval_recall": 1.0,
2298
+ "eval_runtime": 52.457,
2299
+ "eval_samples_per_second": 5.566,
2300
+ "eval_steps_per_second": 0.191,
2301
+ "step": 300
2302
  }
2303
  ],
2304
  "logging_steps": 1,
 
2318
  "attributes": {}
2319
  }
2320
  },
2321
+ "total_flos": 9.289113333137408e+16,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null