mtzig commited on
Commit
dcec137
·
verified ·
1 Parent(s): f5a295e

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce35a7d9d3ee41577a6667581545907c05369b98efeb251f2496e6bc41c8ec77
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c6cd9bbf69ca7d063802c57c85eae4c8a4594c67d7465d15bea57713fe47513
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3183b1742cfca72c52d940d8e04287494b43efa4116947a96e4cccfcd21348d1
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ccc731a20bfee6a4b55db5e68d3cde38d8ac7d2f719abd77539294a3e34b2ec
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4205a4ab8bd014921ab915be98db9b55bb90c27eea063f468f810bebf254273d
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2785839db765cda31082ddf8a82d30deab7dd42debaf778105e6f1e72628bf0b
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27c49502b98af6483397efb3fb254c6f7e946e966f58d1d19162f8d43a197fae
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6543635366541dceae1dd968c306b2f0f2b4ecbd81d3aea258b1607da46815a7
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b45efd5b804d9c79be3d4548ed087b9258b26177b6f16e8676684fc7e504f116
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b475c353783d4026335e7030c57ba069d3d2acec25effee2905e6c973933656
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f06d05203698d9a4d1d043d4f6ec8e5d78d608cb2c2042bf829842852ccf38a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:722e06e8226ee81e6a1d319d49c78ac25f3e3038d6458b00cb35e8a42e093c2b
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9238ae75b55c27a76fd44d1a52af2ef5fcef2e2d365994a5ae17e1a8621203d8
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2282b29b361aca76f71e45f53ef426769df72fecd3cfe3e9040dd5f970c053c
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9f463169e3e2cc274a980569fa1cb4cfa88e7201ab5723d1c28049cdf5ad735
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd6c20f9a7c898a644f42332beb081e2ffc36926b9b070c86a030d2234848a7f
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5fd10842b846f23f804b87787b0db7af5bfcba064be8c3070f885069f8f09eb
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9417da51974d70b68528d0d38ed2aa065576da18992311f443f57eb70377b35a
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2450d0f517cf62b4f3a015159fe38db28367eb0c801cb5225a1b0f787d5dab99
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29eb939b26d6f551db8d7074c3cce05769523646edbde94bdcfcf35d61bc8110
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e779a0b2c2a3ad985f3f55d1ce49fd69594728e960e944e220a1338fd43bc335
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:584643f2a79df1538d3f949cf4a835bec142b64a553256df7939bcbd22be0239
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b87d7520e5c4522a68dbd37ed2479be5e1a14db81e2ef489ecd23f9218d190e0
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7cf12f3a30d07de0068e40183f8060da3680a455554f62dd104c0e905fb0bf5
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e4d36ee848a393c30e3e5c4fa3aa77d375c6146cf30c4d23f89f99b1beaf537
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcc4feba00073f29deb4ad23d7b496e19059f229ff63aac62d469be206d14266
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.18552875695732837,
5
  "eval_steps": 20,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1539,6 +1539,766 @@
1539
  "eval_samples_per_second": 5.569,
1540
  "eval_steps_per_second": 0.182,
1541
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1542
  }
1543
  ],
1544
  "logging_steps": 1,
@@ -1558,7 +2318,7 @@
1558
  "attributes": {}
1559
  }
1560
  },
1561
- "total_flos": 6.403110712901632e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2782931354359926,
5
  "eval_steps": 20,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1539
  "eval_samples_per_second": 5.569,
1540
  "eval_steps_per_second": 0.182,
1541
  "step": 200
1542
+ },
1543
+ {
1544
+ "epoch": 0.18645640074211503,
1545
+ "grad_norm": 3.4353301525115967,
1546
+ "learning_rate": 1.9549799180279793e-05,
1547
+ "loss": 0.2293,
1548
+ "step": 201
1549
+ },
1550
+ {
1551
+ "epoch": 0.18738404452690166,
1552
+ "grad_norm": 3.4561715126037598,
1553
+ "learning_rate": 1.9540140680664915e-05,
1554
+ "loss": 0.1131,
1555
+ "step": 202
1556
+ },
1557
+ {
1558
+ "epoch": 0.18831168831168832,
1559
+ "grad_norm": 4.9482293128967285,
1560
+ "learning_rate": 1.953038210948861e-05,
1561
+ "loss": 0.2009,
1562
+ "step": 203
1563
+ },
1564
+ {
1565
+ "epoch": 0.18923933209647495,
1566
+ "grad_norm": 4.263972759246826,
1567
+ "learning_rate": 1.952052356911368e-05,
1568
+ "loss": 0.2624,
1569
+ "step": 204
1570
+ },
1571
+ {
1572
+ "epoch": 0.1901669758812616,
1573
+ "grad_norm": 4.766571044921875,
1574
+ "learning_rate": 1.9510565162951538e-05,
1575
+ "loss": 0.1823,
1576
+ "step": 205
1577
+ },
1578
+ {
1579
+ "epoch": 0.19109461966604824,
1580
+ "grad_norm": 5.494351387023926,
1581
+ "learning_rate": 1.950050699546116e-05,
1582
+ "loss": 0.2365,
1583
+ "step": 206
1584
+ },
1585
+ {
1586
+ "epoch": 0.19202226345083487,
1587
+ "grad_norm": 5.0484795570373535,
1588
+ "learning_rate": 1.9490349172147964e-05,
1589
+ "loss": 0.2197,
1590
+ "step": 207
1591
+ },
1592
+ {
1593
+ "epoch": 0.19294990723562153,
1594
+ "grad_norm": 4.198167324066162,
1595
+ "learning_rate": 1.9480091799562706e-05,
1596
+ "loss": 0.1825,
1597
+ "step": 208
1598
+ },
1599
+ {
1600
+ "epoch": 0.19387755102040816,
1601
+ "grad_norm": 4.7388105392456055,
1602
+ "learning_rate": 1.9469734985300373e-05,
1603
+ "loss": 0.2195,
1604
+ "step": 209
1605
+ },
1606
+ {
1607
+ "epoch": 0.19480519480519481,
1608
+ "grad_norm": 4.556212902069092,
1609
+ "learning_rate": 1.9459278837999048e-05,
1610
+ "loss": 0.2085,
1611
+ "step": 210
1612
+ },
1613
+ {
1614
+ "epoch": 0.19573283858998144,
1615
+ "grad_norm": 5.153113842010498,
1616
+ "learning_rate": 1.9448723467338765e-05,
1617
+ "loss": 0.2332,
1618
+ "step": 211
1619
+ },
1620
+ {
1621
+ "epoch": 0.19666048237476808,
1622
+ "grad_norm": 2.6720712184906006,
1623
+ "learning_rate": 1.9438068984040366e-05,
1624
+ "loss": 0.1761,
1625
+ "step": 212
1626
+ },
1627
+ {
1628
+ "epoch": 0.19758812615955473,
1629
+ "grad_norm": 2.9613466262817383,
1630
+ "learning_rate": 1.9427315499864345e-05,
1631
+ "loss": 0.1877,
1632
+ "step": 213
1633
+ },
1634
+ {
1635
+ "epoch": 0.19851576994434136,
1636
+ "grad_norm": 3.5731585025787354,
1637
+ "learning_rate": 1.9416463127609655e-05,
1638
+ "loss": 0.2018,
1639
+ "step": 214
1640
+ },
1641
+ {
1642
+ "epoch": 0.19944341372912802,
1643
+ "grad_norm": 4.205770015716553,
1644
+ "learning_rate": 1.9405511981112553e-05,
1645
+ "loss": 0.1952,
1646
+ "step": 215
1647
+ },
1648
+ {
1649
+ "epoch": 0.20037105751391465,
1650
+ "grad_norm": 4.905291557312012,
1651
+ "learning_rate": 1.9394462175245382e-05,
1652
+ "loss": 0.2482,
1653
+ "step": 216
1654
+ },
1655
+ {
1656
+ "epoch": 0.2012987012987013,
1657
+ "grad_norm": 6.575018405914307,
1658
+ "learning_rate": 1.9383313825915372e-05,
1659
+ "loss": 0.2007,
1660
+ "step": 217
1661
+ },
1662
+ {
1663
+ "epoch": 0.20222634508348794,
1664
+ "grad_norm": 4.795684814453125,
1665
+ "learning_rate": 1.937206705006344e-05,
1666
+ "loss": 0.1878,
1667
+ "step": 218
1668
+ },
1669
+ {
1670
+ "epoch": 0.20315398886827457,
1671
+ "grad_norm": 3.1328468322753906,
1672
+ "learning_rate": 1.9360721965662934e-05,
1673
+ "loss": 0.1864,
1674
+ "step": 219
1675
+ },
1676
+ {
1677
+ "epoch": 0.20408163265306123,
1678
+ "grad_norm": 3.297593355178833,
1679
+ "learning_rate": 1.9349278691718426e-05,
1680
+ "loss": 0.2033,
1681
+ "step": 220
1682
+ },
1683
+ {
1684
+ "epoch": 0.20408163265306123,
1685
+ "eval_accuracy": 0.8370288248337029,
1686
+ "eval_f1": 0.6423357664233577,
1687
+ "eval_loss": 0.34552833437919617,
1688
+ "eval_precision": 0.8354430379746836,
1689
+ "eval_recall": 0.5217391304347826,
1690
+ "eval_runtime": 48.9715,
1691
+ "eval_samples_per_second": 5.636,
1692
+ "eval_steps_per_second": 0.184,
1693
+ "step": 220
1694
+ },
1695
+ {
1696
+ "epoch": 0.20500927643784786,
1697
+ "grad_norm": 3.253046989440918,
1698
+ "learning_rate": 1.9337737348264448e-05,
1699
+ "loss": 0.2099,
1700
+ "step": 221
1701
+ },
1702
+ {
1703
+ "epoch": 0.20593692022263452,
1704
+ "grad_norm": 4.903271675109863,
1705
+ "learning_rate": 1.9326098056364224e-05,
1706
+ "loss": 0.2595,
1707
+ "step": 222
1708
+ },
1709
+ {
1710
+ "epoch": 0.20686456400742115,
1711
+ "grad_norm": 6.408766746520996,
1712
+ "learning_rate": 1.9314360938108427e-05,
1713
+ "loss": 0.2615,
1714
+ "step": 223
1715
+ },
1716
+ {
1717
+ "epoch": 0.2077922077922078,
1718
+ "grad_norm": 4.407992839813232,
1719
+ "learning_rate": 1.9302526116613863e-05,
1720
+ "loss": 0.1009,
1721
+ "step": 224
1722
+ },
1723
+ {
1724
+ "epoch": 0.20871985157699444,
1725
+ "grad_norm": 2.9286835193634033,
1726
+ "learning_rate": 1.9290593716022218e-05,
1727
+ "loss": 0.1412,
1728
+ "step": 225
1729
+ },
1730
+ {
1731
+ "epoch": 0.20964749536178107,
1732
+ "grad_norm": 3.6685068607330322,
1733
+ "learning_rate": 1.9278563861498726e-05,
1734
+ "loss": 0.221,
1735
+ "step": 226
1736
+ },
1737
+ {
1738
+ "epoch": 0.21057513914656772,
1739
+ "grad_norm": 4.300150394439697,
1740
+ "learning_rate": 1.9266436679230866e-05,
1741
+ "loss": 0.1645,
1742
+ "step": 227
1743
+ },
1744
+ {
1745
+ "epoch": 0.21150278293135436,
1746
+ "grad_norm": 4.307882308959961,
1747
+ "learning_rate": 1.9254212296427043e-05,
1748
+ "loss": 0.2284,
1749
+ "step": 228
1750
+ },
1751
+ {
1752
+ "epoch": 0.212430426716141,
1753
+ "grad_norm": 2.4391989707946777,
1754
+ "learning_rate": 1.924189084131525e-05,
1755
+ "loss": 0.118,
1756
+ "step": 229
1757
+ },
1758
+ {
1759
+ "epoch": 0.21335807050092764,
1760
+ "grad_norm": 5.997758388519287,
1761
+ "learning_rate": 1.922947244314172e-05,
1762
+ "loss": 0.2798,
1763
+ "step": 230
1764
+ },
1765
+ {
1766
+ "epoch": 0.21428571428571427,
1767
+ "grad_norm": 4.957704544067383,
1768
+ "learning_rate": 1.9216957232169567e-05,
1769
+ "loss": 0.1852,
1770
+ "step": 231
1771
+ },
1772
+ {
1773
+ "epoch": 0.21521335807050093,
1774
+ "grad_norm": 5.667599678039551,
1775
+ "learning_rate": 1.9204345339677442e-05,
1776
+ "loss": 0.2155,
1777
+ "step": 232
1778
+ },
1779
+ {
1780
+ "epoch": 0.21614100185528756,
1781
+ "grad_norm": 4.291685104370117,
1782
+ "learning_rate": 1.9191636897958123e-05,
1783
+ "loss": 0.2637,
1784
+ "step": 233
1785
+ },
1786
+ {
1787
+ "epoch": 0.21706864564007422,
1788
+ "grad_norm": 4.587561130523682,
1789
+ "learning_rate": 1.9178832040317153e-05,
1790
+ "loss": 0.2383,
1791
+ "step": 234
1792
+ },
1793
+ {
1794
+ "epoch": 0.21799628942486085,
1795
+ "grad_norm": 3.8426513671875,
1796
+ "learning_rate": 1.916593090107143e-05,
1797
+ "loss": 0.212,
1798
+ "step": 235
1799
+ },
1800
+ {
1801
+ "epoch": 0.2189239332096475,
1802
+ "grad_norm": 5.173671722412109,
1803
+ "learning_rate": 1.91529336155478e-05,
1804
+ "loss": 0.2054,
1805
+ "step": 236
1806
+ },
1807
+ {
1808
+ "epoch": 0.21985157699443414,
1809
+ "grad_norm": 4.827680587768555,
1810
+ "learning_rate": 1.913984032008163e-05,
1811
+ "loss": 0.2072,
1812
+ "step": 237
1813
+ },
1814
+ {
1815
+ "epoch": 0.22077922077922077,
1816
+ "grad_norm": 7.693399906158447,
1817
+ "learning_rate": 1.9126651152015404e-05,
1818
+ "loss": 0.2071,
1819
+ "step": 238
1820
+ },
1821
+ {
1822
+ "epoch": 0.22170686456400743,
1823
+ "grad_norm": 3.128953695297241,
1824
+ "learning_rate": 1.911336624969725e-05,
1825
+ "loss": 0.2271,
1826
+ "step": 239
1827
+ },
1828
+ {
1829
+ "epoch": 0.22263450834879406,
1830
+ "grad_norm": 6.966447353363037,
1831
+ "learning_rate": 1.9099985752479505e-05,
1832
+ "loss": 0.2448,
1833
+ "step": 240
1834
+ },
1835
+ {
1836
+ "epoch": 0.22263450834879406,
1837
+ "eval_accuracy": 0.8381374722838137,
1838
+ "eval_f1": 0.6403940886699507,
1839
+ "eval_loss": 0.34381967782974243,
1840
+ "eval_precision": 0.8496732026143791,
1841
+ "eval_recall": 0.5138339920948617,
1842
+ "eval_runtime": 48.6504,
1843
+ "eval_samples_per_second": 5.673,
1844
+ "eval_steps_per_second": 0.185,
1845
+ "step": 240
1846
+ },
1847
+ {
1848
+ "epoch": 0.22356215213358072,
1849
+ "grad_norm": 4.817497730255127,
1850
+ "learning_rate": 1.908650980071726e-05,
1851
+ "loss": 0.1878,
1852
+ "step": 241
1853
+ },
1854
+ {
1855
+ "epoch": 0.22448979591836735,
1856
+ "grad_norm": 3.612339973449707,
1857
+ "learning_rate": 1.9072938535766864e-05,
1858
+ "loss": 0.1226,
1859
+ "step": 242
1860
+ },
1861
+ {
1862
+ "epoch": 0.22541743970315398,
1863
+ "grad_norm": 4.005315780639648,
1864
+ "learning_rate": 1.905927209998447e-05,
1865
+ "loss": 0.2076,
1866
+ "step": 243
1867
+ },
1868
+ {
1869
+ "epoch": 0.22634508348794063,
1870
+ "grad_norm": 3.032423496246338,
1871
+ "learning_rate": 1.904551063672452e-05,
1872
+ "loss": 0.1511,
1873
+ "step": 244
1874
+ },
1875
+ {
1876
+ "epoch": 0.22727272727272727,
1877
+ "grad_norm": 3.060365676879883,
1878
+ "learning_rate": 1.9031654290338256e-05,
1879
+ "loss": 0.1954,
1880
+ "step": 245
1881
+ },
1882
+ {
1883
+ "epoch": 0.22820037105751392,
1884
+ "grad_norm": 5.271503448486328,
1885
+ "learning_rate": 1.9017703206172187e-05,
1886
+ "loss": 0.2244,
1887
+ "step": 246
1888
+ },
1889
+ {
1890
+ "epoch": 0.22912801484230055,
1891
+ "grad_norm": 2.923628568649292,
1892
+ "learning_rate": 1.900365753056659e-05,
1893
+ "loss": 0.1753,
1894
+ "step": 247
1895
+ },
1896
+ {
1897
+ "epoch": 0.2300556586270872,
1898
+ "grad_norm": 3.745664119720459,
1899
+ "learning_rate": 1.8989517410853956e-05,
1900
+ "loss": 0.188,
1901
+ "step": 248
1902
+ },
1903
+ {
1904
+ "epoch": 0.23098330241187384,
1905
+ "grad_norm": 3.7633256912231445,
1906
+ "learning_rate": 1.8975282995357448e-05,
1907
+ "loss": 0.201,
1908
+ "step": 249
1909
+ },
1910
+ {
1911
+ "epoch": 0.23191094619666047,
1912
+ "grad_norm": 3.29656720161438,
1913
+ "learning_rate": 1.896095443338935e-05,
1914
+ "loss": 0.1956,
1915
+ "step": 250
1916
+ },
1917
+ {
1918
+ "epoch": 0.23283858998144713,
1919
+ "grad_norm": 4.702951431274414,
1920
+ "learning_rate": 1.8946531875249496e-05,
1921
+ "loss": 0.2101,
1922
+ "step": 251
1923
+ },
1924
+ {
1925
+ "epoch": 0.23376623376623376,
1926
+ "grad_norm": 7.016535758972168,
1927
+ "learning_rate": 1.8932015472223692e-05,
1928
+ "loss": 0.2486,
1929
+ "step": 252
1930
+ },
1931
+ {
1932
+ "epoch": 0.23469387755102042,
1933
+ "grad_norm": 4.004672050476074,
1934
+ "learning_rate": 1.8917405376582144e-05,
1935
+ "loss": 0.239,
1936
+ "step": 253
1937
+ },
1938
+ {
1939
+ "epoch": 0.23562152133580705,
1940
+ "grad_norm": 4.382692337036133,
1941
+ "learning_rate": 1.8902701741577844e-05,
1942
+ "loss": 0.2308,
1943
+ "step": 254
1944
+ },
1945
+ {
1946
+ "epoch": 0.23654916512059368,
1947
+ "grad_norm": 4.72487735748291,
1948
+ "learning_rate": 1.8887904721444955e-05,
1949
+ "loss": 0.2098,
1950
+ "step": 255
1951
+ },
1952
+ {
1953
+ "epoch": 0.23747680890538034,
1954
+ "grad_norm": 3.4035651683807373,
1955
+ "learning_rate": 1.8873014471397225e-05,
1956
+ "loss": 0.101,
1957
+ "step": 256
1958
+ },
1959
+ {
1960
+ "epoch": 0.23840445269016697,
1961
+ "grad_norm": 7.383582592010498,
1962
+ "learning_rate": 1.8858031147626326e-05,
1963
+ "loss": 0.2499,
1964
+ "step": 257
1965
+ },
1966
+ {
1967
+ "epoch": 0.23933209647495363,
1968
+ "grad_norm": 4.3388895988464355,
1969
+ "learning_rate": 1.8842954907300236e-05,
1970
+ "loss": 0.1859,
1971
+ "step": 258
1972
+ },
1973
+ {
1974
+ "epoch": 0.24025974025974026,
1975
+ "grad_norm": 3.2572548389434814,
1976
+ "learning_rate": 1.8827785908561585e-05,
1977
+ "loss": 0.1833,
1978
+ "step": 259
1979
+ },
1980
+ {
1981
+ "epoch": 0.24118738404452691,
1982
+ "grad_norm": 7.2369561195373535,
1983
+ "learning_rate": 1.881252431052599e-05,
1984
+ "loss": 0.2337,
1985
+ "step": 260
1986
+ },
1987
+ {
1988
+ "epoch": 0.24118738404452691,
1989
+ "eval_accuracy": 0.8281596452328159,
1990
+ "eval_f1": 0.5931758530183727,
1991
+ "eval_loss": 0.3704891502857208,
1992
+ "eval_precision": 0.8828125,
1993
+ "eval_recall": 0.44664031620553357,
1994
+ "eval_runtime": 47.7821,
1995
+ "eval_samples_per_second": 5.776,
1996
+ "eval_steps_per_second": 0.188,
1997
+ "step": 260
1998
+ },
1999
+ {
2000
+ "epoch": 0.24211502782931354,
2001
+ "grad_norm": 6.052790641784668,
2002
+ "learning_rate": 1.879717027328039e-05,
2003
+ "loss": 0.2569,
2004
+ "step": 261
2005
+ },
2006
+ {
2007
+ "epoch": 0.24304267161410018,
2008
+ "grad_norm": 3.246156692504883,
2009
+ "learning_rate": 1.8781723957881374e-05,
2010
+ "loss": 0.1871,
2011
+ "step": 262
2012
+ },
2013
+ {
2014
+ "epoch": 0.24397031539888683,
2015
+ "grad_norm": 6.536667823791504,
2016
+ "learning_rate": 1.876618552635348e-05,
2017
+ "loss": 0.1425,
2018
+ "step": 263
2019
+ },
2020
+ {
2021
+ "epoch": 0.24489795918367346,
2022
+ "grad_norm": 5.4920830726623535,
2023
+ "learning_rate": 1.87505551416875e-05,
2024
+ "loss": 0.2275,
2025
+ "step": 264
2026
+ },
2027
+ {
2028
+ "epoch": 0.24582560296846012,
2029
+ "grad_norm": 7.879229545593262,
2030
+ "learning_rate": 1.8734832967838775e-05,
2031
+ "loss": 0.2852,
2032
+ "step": 265
2033
+ },
2034
+ {
2035
+ "epoch": 0.24675324675324675,
2036
+ "grad_norm": 3.7755329608917236,
2037
+ "learning_rate": 1.871901916972547e-05,
2038
+ "loss": 0.1967,
2039
+ "step": 266
2040
+ },
2041
+ {
2042
+ "epoch": 0.24768089053803338,
2043
+ "grad_norm": 4.236266136169434,
2044
+ "learning_rate": 1.8703113913226847e-05,
2045
+ "loss": 0.1302,
2046
+ "step": 267
2047
+ },
2048
+ {
2049
+ "epoch": 0.24860853432282004,
2050
+ "grad_norm": 5.8521599769592285,
2051
+ "learning_rate": 1.8687117365181514e-05,
2052
+ "loss": 0.2866,
2053
+ "step": 268
2054
+ },
2055
+ {
2056
+ "epoch": 0.24953617810760667,
2057
+ "grad_norm": 2.9448065757751465,
2058
+ "learning_rate": 1.867102969338569e-05,
2059
+ "loss": 0.171,
2060
+ "step": 269
2061
+ },
2062
+ {
2063
+ "epoch": 0.2504638218923933,
2064
+ "grad_norm": 4.411244869232178,
2065
+ "learning_rate": 1.865485106659145e-05,
2066
+ "loss": 0.1424,
2067
+ "step": 270
2068
+ },
2069
+ {
2070
+ "epoch": 0.25139146567717996,
2071
+ "grad_norm": 5.2516655921936035,
2072
+ "learning_rate": 1.863858165450492e-05,
2073
+ "loss": 0.2166,
2074
+ "step": 271
2075
+ },
2076
+ {
2077
+ "epoch": 0.2523191094619666,
2078
+ "grad_norm": 2.6857924461364746,
2079
+ "learning_rate": 1.862222162778454e-05,
2080
+ "loss": 0.1786,
2081
+ "step": 272
2082
+ },
2083
+ {
2084
+ "epoch": 0.2532467532467532,
2085
+ "grad_norm": 6.46083927154541,
2086
+ "learning_rate": 1.8605771158039253e-05,
2087
+ "loss": 0.1817,
2088
+ "step": 273
2089
+ },
2090
+ {
2091
+ "epoch": 0.2541743970315399,
2092
+ "grad_norm": 6.851802349090576,
2093
+ "learning_rate": 1.85892304178267e-05,
2094
+ "loss": 0.174,
2095
+ "step": 274
2096
+ },
2097
+ {
2098
+ "epoch": 0.25510204081632654,
2099
+ "grad_norm": 4.372910022735596,
2100
+ "learning_rate": 1.8572599580651415e-05,
2101
+ "loss": 0.211,
2102
+ "step": 275
2103
+ },
2104
+ {
2105
+ "epoch": 0.2560296846011132,
2106
+ "grad_norm": 8.214804649353027,
2107
+ "learning_rate": 1.8555878820963014e-05,
2108
+ "loss": 0.2295,
2109
+ "step": 276
2110
+ },
2111
+ {
2112
+ "epoch": 0.2569573283858998,
2113
+ "grad_norm": 3.299901247024536,
2114
+ "learning_rate": 1.8539068314154355e-05,
2115
+ "loss": 0.1784,
2116
+ "step": 277
2117
+ },
2118
+ {
2119
+ "epoch": 0.25788497217068646,
2120
+ "grad_norm": 3.1576666831970215,
2121
+ "learning_rate": 1.8522168236559693e-05,
2122
+ "loss": 0.1864,
2123
+ "step": 278
2124
+ },
2125
+ {
2126
+ "epoch": 0.2588126159554731,
2127
+ "grad_norm": 6.918071269989014,
2128
+ "learning_rate": 1.8505178765452853e-05,
2129
+ "loss": 0.2015,
2130
+ "step": 279
2131
+ },
2132
+ {
2133
+ "epoch": 0.2597402597402597,
2134
+ "grad_norm": 2.884896755218506,
2135
+ "learning_rate": 1.8488100079045345e-05,
2136
+ "loss": 0.1698,
2137
+ "step": 280
2138
+ },
2139
+ {
2140
+ "epoch": 0.2597402597402597,
2141
+ "eval_accuracy": 0.8215077605321508,
2142
+ "eval_f1": 0.5729442970822282,
2143
+ "eval_loss": 0.37236273288726807,
2144
+ "eval_precision": 0.8709677419354839,
2145
+ "eval_recall": 0.4268774703557312,
2146
+ "eval_runtime": 48.19,
2147
+ "eval_samples_per_second": 5.727,
2148
+ "eval_steps_per_second": 0.187,
2149
+ "step": 280
2150
+ },
2151
+ {
2152
+ "epoch": 0.2606679035250464,
2153
+ "grad_norm": 2.52854061126709,
2154
+ "learning_rate": 1.847093235648451e-05,
2155
+ "loss": 0.1427,
2156
+ "step": 281
2157
+ },
2158
+ {
2159
+ "epoch": 0.26159554730983303,
2160
+ "grad_norm": 3.8434646129608154,
2161
+ "learning_rate": 1.8453675777851627e-05,
2162
+ "loss": 0.2429,
2163
+ "step": 282
2164
+ },
2165
+ {
2166
+ "epoch": 0.2625231910946197,
2167
+ "grad_norm": 3.32183837890625,
2168
+ "learning_rate": 1.8436330524160048e-05,
2169
+ "loss": 0.1672,
2170
+ "step": 283
2171
+ },
2172
+ {
2173
+ "epoch": 0.2634508348794063,
2174
+ "grad_norm": 3.077954053878784,
2175
+ "learning_rate": 1.8418896777353272e-05,
2176
+ "loss": 0.1511,
2177
+ "step": 284
2178
+ },
2179
+ {
2180
+ "epoch": 0.26437847866419295,
2181
+ "grad_norm": 6.1399617195129395,
2182
+ "learning_rate": 1.8401374720303054e-05,
2183
+ "loss": 0.1989,
2184
+ "step": 285
2185
+ },
2186
+ {
2187
+ "epoch": 0.2653061224489796,
2188
+ "grad_norm": 4.476570129394531,
2189
+ "learning_rate": 1.8383764536807486e-05,
2190
+ "loss": 0.2204,
2191
+ "step": 286
2192
+ },
2193
+ {
2194
+ "epoch": 0.2662337662337662,
2195
+ "grad_norm": 3.3272931575775146,
2196
+ "learning_rate": 1.836606641158905e-05,
2197
+ "loss": 0.1733,
2198
+ "step": 287
2199
+ },
2200
+ {
2201
+ "epoch": 0.26716141001855287,
2202
+ "grad_norm": 3.8696646690368652,
2203
+ "learning_rate": 1.8348280530292712e-05,
2204
+ "loss": 0.2698,
2205
+ "step": 288
2206
+ },
2207
+ {
2208
+ "epoch": 0.2680890538033395,
2209
+ "grad_norm": 4.347325325012207,
2210
+ "learning_rate": 1.833040707948395e-05,
2211
+ "loss": 0.2098,
2212
+ "step": 289
2213
+ },
2214
+ {
2215
+ "epoch": 0.2690166975881262,
2216
+ "grad_norm": 3.8024332523345947,
2217
+ "learning_rate": 1.831244624664681e-05,
2218
+ "loss": 0.2092,
2219
+ "step": 290
2220
+ },
2221
+ {
2222
+ "epoch": 0.2699443413729128,
2223
+ "grad_norm": 3.742701768875122,
2224
+ "learning_rate": 1.829439822018192e-05,
2225
+ "loss": 0.1793,
2226
+ "step": 291
2227
+ },
2228
+ {
2229
+ "epoch": 0.27087198515769945,
2230
+ "grad_norm": 4.459840774536133,
2231
+ "learning_rate": 1.827626318940454e-05,
2232
+ "loss": 0.224,
2233
+ "step": 292
2234
+ },
2235
+ {
2236
+ "epoch": 0.2717996289424861,
2237
+ "grad_norm": 3.4559335708618164,
2238
+ "learning_rate": 1.8258041344542567e-05,
2239
+ "loss": 0.2342,
2240
+ "step": 293
2241
+ },
2242
+ {
2243
+ "epoch": 0.2727272727272727,
2244
+ "grad_norm": 4.520707607269287,
2245
+ "learning_rate": 1.8239732876734525e-05,
2246
+ "loss": 0.2899,
2247
+ "step": 294
2248
+ },
2249
+ {
2250
+ "epoch": 0.27365491651205937,
2251
+ "grad_norm": 3.844388246536255,
2252
+ "learning_rate": 1.822133797802758e-05,
2253
+ "loss": 0.172,
2254
+ "step": 295
2255
+ },
2256
+ {
2257
+ "epoch": 0.274582560296846,
2258
+ "grad_norm": 3.944843292236328,
2259
+ "learning_rate": 1.8202856841375517e-05,
2260
+ "loss": 0.1602,
2261
+ "step": 296
2262
+ },
2263
+ {
2264
+ "epoch": 0.2755102040816326,
2265
+ "grad_norm": 2.833136796951294,
2266
+ "learning_rate": 1.8184289660636715e-05,
2267
+ "loss": 0.1829,
2268
+ "step": 297
2269
+ },
2270
+ {
2271
+ "epoch": 0.2764378478664193,
2272
+ "grad_norm": 5.877793312072754,
2273
+ "learning_rate": 1.816563663057211e-05,
2274
+ "loss": 0.1776,
2275
+ "step": 298
2276
+ },
2277
+ {
2278
+ "epoch": 0.27736549165120594,
2279
+ "grad_norm": 7.863223552703857,
2280
+ "learning_rate": 1.8146897946843162e-05,
2281
+ "loss": 0.2734,
2282
+ "step": 299
2283
+ },
2284
+ {
2285
+ "epoch": 0.2782931354359926,
2286
+ "grad_norm": 3.5865793228149414,
2287
+ "learning_rate": 1.81280738060098e-05,
2288
+ "loss": 0.1607,
2289
+ "step": 300
2290
+ },
2291
+ {
2292
+ "epoch": 0.2782931354359926,
2293
+ "eval_accuracy": 0.8292682926829268,
2294
+ "eval_f1": 0.6010362694300518,
2295
+ "eval_loss": 0.34550240635871887,
2296
+ "eval_precision": 0.8721804511278195,
2297
+ "eval_recall": 0.45849802371541504,
2298
+ "eval_runtime": 47.9887,
2299
+ "eval_samples_per_second": 5.751,
2300
+ "eval_steps_per_second": 0.188,
2301
+ "step": 300
2302
  }
2303
  ],
2304
  "logging_steps": 1,
 
2318
  "attributes": {}
2319
  }
2320
  },
2321
+ "total_flos": 9.58457253545902e+16,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null