mtzig commited on
Commit
137f25d
·
verified ·
1 Parent(s): ccf0dda

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35f51831b2b098cdce1336c36fdb466a2549cbaa1f8a57f3dfb51b4a2a5bf371
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e380ae04a5173aeaed71f5a23293af6a6b5ce9b37a1646c0f6027f825d779fc
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a789181143a79739789063cd00a232ab9f16e3bca19ecdc66bcebfc70abdf7f0
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b043b19da04e16af34cc8400633335c68ab02712105f1221be29791b7d4e409
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b06c737fb3780906c6db6f49888f41e1ff147cd36f721e2ec559502e5722dcf
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f7f2a92d7df0f41408c607126dde2ec742d9311ee46369d1b8e81e62ba64c29
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95524888ea03dc8db342a6452b79ac2dc498646d4c1397845f5c61de5e72a273
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ddb9986bf6ad380f520fbe804799f709d80c796ef3ff88134b3291fe5611761
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:405be8d53a873909641aaba4d30e01e797a1e6db0878263ef451a17ff9e941b9
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a5ac5a37a7f3a37a7f5328e215de1663f8e85b03df885c4f3a38576bbb58b65
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4da83fa371fc825b162abc2365a97c78bdd6c68b3c8715678d0f9f6a05017b53
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9c10492278bc53059b6ed6f765490ebda8641ddb2ca6422c5a3ff08f7b12216
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cda0931130711d5820d481d7b5cc9a36c4df6219fb59d6ebb68f7ab10a011c4b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aac0a2ddbb2e0439e67de80ba07301bbb4f6fae538d608784bb99a990eb4374
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d48786f7f9663e086296dbd832d3f41b07c50d093d8d13185ba7c06b778eba15
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70a08e5416ead83a8ca5a4737f339d26abe014328af01895f5dc9b9056c94042
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad1d2ce7a6ea45f3182b7421bc96713b2844cd0ec18a52bec861802d753d23df
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f53af7b36bcb1f4a3830f3094a6baae96f54d6751f1aca6ab0241469f55b4c77
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bde33c3dc9d4b5847aa5e82a41ef1d715b6cab5c6f68c90d9c12b98c9395b5a1
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b93e300fca3864a3b00ebf8f20bb271d3ee6a1118129c64855b165724ec8737a
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcb9b1e0f74c5f2631e58aded928e5d64789892339a1cd1a1bb054b2a8717bf3
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aeed4b8b1f8111068fdf649eef309274cafb5724b7079e7c1ab8b7d24799ae1
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e93d6940d870db9ebba78cee7722d0384b494610e71e7f8b2e22bb0fd8e406ed
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bebf1ac8b6d22b64f12ee5515472ef4631edd26eaeae162231d65e567dd578f
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a024530de56227bb3ef9eb28b732e8ef3d765c77ebd0a0c5bc59f62e1682f1a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba5d98c98ae03b619b5cc816786d7328ffd6502c6e3927d2220789c3367ca675
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2510460251046025,
5
  "eval_steps": 20,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1539,6 +1539,766 @@
1539
  "eval_samples_per_second": 5.298,
1540
  "eval_steps_per_second": 0.172,
1541
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1542
  }
1543
  ],
1544
  "logging_steps": 1,
@@ -1558,7 +2318,7 @@
1558
  "attributes": {}
1559
  }
1560
  },
1561
- "total_flos": 7.32374373582766e+16,
1562
  "train_batch_size": 6,
1563
  "trial_name": null,
1564
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.37656903765690375,
5
  "eval_steps": 20,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1539
  "eval_samples_per_second": 5.298,
1540
  "eval_steps_per_second": 0.172,
1541
  "step": 200
1542
+ },
1543
+ {
1544
+ "epoch": 0.25230125523012553,
1545
+ "grad_norm": 3.1597511768341064,
1546
+ "learning_rate": 1.8623458640038817e-05,
1547
+ "loss": 0.2417,
1548
+ "step": 201
1549
+ },
1550
+ {
1551
+ "epoch": 0.2535564853556485,
1552
+ "grad_norm": 4.373691558837891,
1553
+ "learning_rate": 1.8601159918043533e-05,
1554
+ "loss": 0.2408,
1555
+ "step": 202
1556
+ },
1557
+ {
1558
+ "epoch": 0.25481171548117154,
1559
+ "grad_norm": 3.5026726722717285,
1560
+ "learning_rate": 1.857869560760377e-05,
1561
+ "loss": 0.281,
1562
+ "step": 203
1563
+ },
1564
+ {
1565
+ "epoch": 0.2560669456066946,
1566
+ "grad_norm": 4.196898460388184,
1567
+ "learning_rate": 1.85560661411996e-05,
1568
+ "loss": 0.2201,
1569
+ "step": 204
1570
+ },
1571
+ {
1572
+ "epoch": 0.25732217573221755,
1573
+ "grad_norm": 3.8971402645111084,
1574
+ "learning_rate": 1.8533271954490655e-05,
1575
+ "loss": 0.2692,
1576
+ "step": 205
1577
+ },
1578
+ {
1579
+ "epoch": 0.2585774058577406,
1580
+ "grad_norm": 6.247049808502197,
1581
+ "learning_rate": 1.8510313486307734e-05,
1582
+ "loss": 0.2653,
1583
+ "step": 206
1584
+ },
1585
+ {
1586
+ "epoch": 0.2598326359832636,
1587
+ "grad_norm": 5.326446056365967,
1588
+ "learning_rate": 1.848719117864437e-05,
1589
+ "loss": 0.2857,
1590
+ "step": 207
1591
+ },
1592
+ {
1593
+ "epoch": 0.2610878661087866,
1594
+ "grad_norm": 3.6153714656829834,
1595
+ "learning_rate": 1.846390547664831e-05,
1596
+ "loss": 0.262,
1597
+ "step": 208
1598
+ },
1599
+ {
1600
+ "epoch": 0.2623430962343096,
1601
+ "grad_norm": 6.952093124389648,
1602
+ "learning_rate": 1.8440456828612946e-05,
1603
+ "loss": 0.2807,
1604
+ "step": 209
1605
+ },
1606
+ {
1607
+ "epoch": 0.26359832635983266,
1608
+ "grad_norm": 3.8363044261932373,
1609
+ "learning_rate": 1.841684568596869e-05,
1610
+ "loss": 0.2604,
1611
+ "step": 210
1612
+ },
1613
+ {
1614
+ "epoch": 0.26485355648535563,
1615
+ "grad_norm": 3.643761396408081,
1616
+ "learning_rate": 1.8393072503274277e-05,
1617
+ "loss": 0.2796,
1618
+ "step": 211
1619
+ },
1620
+ {
1621
+ "epoch": 0.26610878661087867,
1622
+ "grad_norm": 3.259951114654541,
1623
+ "learning_rate": 1.836913773820802e-05,
1624
+ "loss": 0.2724,
1625
+ "step": 212
1626
+ },
1627
+ {
1628
+ "epoch": 0.2673640167364017,
1629
+ "grad_norm": 4.189282417297363,
1630
+ "learning_rate": 1.834504185155899e-05,
1631
+ "loss": 0.2455,
1632
+ "step": 213
1633
+ },
1634
+ {
1635
+ "epoch": 0.2686192468619247,
1636
+ "grad_norm": 4.426260948181152,
1637
+ "learning_rate": 1.832078530721816e-05,
1638
+ "loss": 0.2975,
1639
+ "step": 214
1640
+ },
1641
+ {
1642
+ "epoch": 0.2698744769874477,
1643
+ "grad_norm": 4.503783226013184,
1644
+ "learning_rate": 1.829636857216945e-05,
1645
+ "loss": 0.2852,
1646
+ "step": 215
1647
+ },
1648
+ {
1649
+ "epoch": 0.2711297071129707,
1650
+ "grad_norm": 4.618401527404785,
1651
+ "learning_rate": 1.8271792116480767e-05,
1652
+ "loss": 0.3006,
1653
+ "step": 216
1654
+ },
1655
+ {
1656
+ "epoch": 0.2723849372384937,
1657
+ "grad_norm": 5.484090805053711,
1658
+ "learning_rate": 1.8247056413294927e-05,
1659
+ "loss": 0.3397,
1660
+ "step": 217
1661
+ },
1662
+ {
1663
+ "epoch": 0.27364016736401675,
1664
+ "grad_norm": 4.215097427368164,
1665
+ "learning_rate": 1.8222161938820564e-05,
1666
+ "loss": 0.312,
1667
+ "step": 218
1668
+ },
1669
+ {
1670
+ "epoch": 0.27489539748953973,
1671
+ "grad_norm": 2.8045787811279297,
1672
+ "learning_rate": 1.8197109172322958e-05,
1673
+ "loss": 0.2896,
1674
+ "step": 219
1675
+ },
1676
+ {
1677
+ "epoch": 0.27615062761506276,
1678
+ "grad_norm": 3.3742685317993164,
1679
+ "learning_rate": 1.8171898596114804e-05,
1680
+ "loss": 0.2715,
1681
+ "step": 220
1682
+ },
1683
+ {
1684
+ "epoch": 0.27615062761506276,
1685
+ "eval_accuracy": 0.8410596026490066,
1686
+ "eval_f1": 0.6587677725118484,
1687
+ "eval_loss": 0.3223263919353485,
1688
+ "eval_precision": 0.8273809523809523,
1689
+ "eval_recall": 0.547244094488189,
1690
+ "eval_runtime": 52.2317,
1691
+ "eval_samples_per_second": 5.303,
1692
+ "eval_steps_per_second": 0.172,
1693
+ "step": 220
1694
+ },
1695
+ {
1696
+ "epoch": 0.2774058577405858,
1697
+ "grad_norm": 2.477954149246216,
1698
+ "learning_rate": 1.8146530695546934e-05,
1699
+ "loss": 0.2171,
1700
+ "step": 221
1701
+ },
1702
+ {
1703
+ "epoch": 0.27866108786610877,
1704
+ "grad_norm": 3.73885440826416,
1705
+ "learning_rate": 1.8121005958998968e-05,
1706
+ "loss": 0.3282,
1707
+ "step": 222
1708
+ },
1709
+ {
1710
+ "epoch": 0.2799163179916318,
1711
+ "grad_norm": 3.0945334434509277,
1712
+ "learning_rate": 1.8095324877869902e-05,
1713
+ "loss": 0.2823,
1714
+ "step": 223
1715
+ },
1716
+ {
1717
+ "epoch": 0.28117154811715483,
1718
+ "grad_norm": 3.98866868019104,
1719
+ "learning_rate": 1.8069487946568675e-05,
1720
+ "loss": 0.3008,
1721
+ "step": 224
1722
+ },
1723
+ {
1724
+ "epoch": 0.2824267782426778,
1725
+ "grad_norm": 3.4288768768310547,
1726
+ "learning_rate": 1.804349566250462e-05,
1727
+ "loss": 0.2644,
1728
+ "step": 225
1729
+ },
1730
+ {
1731
+ "epoch": 0.28368200836820084,
1732
+ "grad_norm": 3.3643836975097656,
1733
+ "learning_rate": 1.801734852607791e-05,
1734
+ "loss": 0.2543,
1735
+ "step": 226
1736
+ },
1737
+ {
1738
+ "epoch": 0.2849372384937239,
1739
+ "grad_norm": 5.725021839141846,
1740
+ "learning_rate": 1.799104704066991e-05,
1741
+ "loss": 0.2827,
1742
+ "step": 227
1743
+ },
1744
+ {
1745
+ "epoch": 0.28619246861924685,
1746
+ "grad_norm": 4.484889507293701,
1747
+ "learning_rate": 1.79645917126335e-05,
1748
+ "loss": 0.3096,
1749
+ "step": 228
1750
+ },
1751
+ {
1752
+ "epoch": 0.2874476987447699,
1753
+ "grad_norm": 5.622531414031982,
1754
+ "learning_rate": 1.7937983051283312e-05,
1755
+ "loss": 0.3283,
1756
+ "step": 229
1757
+ },
1758
+ {
1759
+ "epoch": 0.28870292887029286,
1760
+ "grad_norm": 4.898491382598877,
1761
+ "learning_rate": 1.7911221568885935e-05,
1762
+ "loss": 0.2316,
1763
+ "step": 230
1764
+ },
1765
+ {
1766
+ "epoch": 0.2899581589958159,
1767
+ "grad_norm": 4.367154121398926,
1768
+ "learning_rate": 1.7884307780650047e-05,
1769
+ "loss": 0.2739,
1770
+ "step": 231
1771
+ },
1772
+ {
1773
+ "epoch": 0.29121338912133893,
1774
+ "grad_norm": 4.016841888427734,
1775
+ "learning_rate": 1.7857242204716497e-05,
1776
+ "loss": 0.2375,
1777
+ "step": 232
1778
+ },
1779
+ {
1780
+ "epoch": 0.2924686192468619,
1781
+ "grad_norm": 3.615976333618164,
1782
+ "learning_rate": 1.783002536214834e-05,
1783
+ "loss": 0.2644,
1784
+ "step": 233
1785
+ },
1786
+ {
1787
+ "epoch": 0.29372384937238494,
1788
+ "grad_norm": 5.212274074554443,
1789
+ "learning_rate": 1.780265777692079e-05,
1790
+ "loss": 0.3412,
1791
+ "step": 234
1792
+ },
1793
+ {
1794
+ "epoch": 0.29497907949790797,
1795
+ "grad_norm": 3.3848087787628174,
1796
+ "learning_rate": 1.7775139975911143e-05,
1797
+ "loss": 0.2489,
1798
+ "step": 235
1799
+ },
1800
+ {
1801
+ "epoch": 0.29623430962343095,
1802
+ "grad_norm": 5.973453998565674,
1803
+ "learning_rate": 1.7747472488888622e-05,
1804
+ "loss": 0.2657,
1805
+ "step": 236
1806
+ },
1807
+ {
1808
+ "epoch": 0.297489539748954,
1809
+ "grad_norm": 4.158175468444824,
1810
+ "learning_rate": 1.77196558485042e-05,
1811
+ "loss": 0.2951,
1812
+ "step": 237
1813
+ },
1814
+ {
1815
+ "epoch": 0.298744769874477,
1816
+ "grad_norm": 3.3108043670654297,
1817
+ "learning_rate": 1.7691690590280325e-05,
1818
+ "loss": 0.26,
1819
+ "step": 238
1820
+ },
1821
+ {
1822
+ "epoch": 0.3,
1823
+ "grad_norm": 4.062819480895996,
1824
+ "learning_rate": 1.7663577252600612e-05,
1825
+ "loss": 0.2535,
1826
+ "step": 239
1827
+ },
1828
+ {
1829
+ "epoch": 0.301255230125523,
1830
+ "grad_norm": 4.0478339195251465,
1831
+ "learning_rate": 1.763531637669949e-05,
1832
+ "loss": 0.2737,
1833
+ "step": 240
1834
+ },
1835
+ {
1836
+ "epoch": 0.301255230125523,
1837
+ "eval_accuracy": 0.8520971302428256,
1838
+ "eval_f1": 0.6995515695067265,
1839
+ "eval_loss": 0.3201642632484436,
1840
+ "eval_precision": 0.8125,
1841
+ "eval_recall": 0.6141732283464567,
1842
+ "eval_runtime": 52.8485,
1843
+ "eval_samples_per_second": 5.241,
1844
+ "eval_steps_per_second": 0.17,
1845
+ "step": 240
1846
+ },
1847
+ {
1848
+ "epoch": 0.302510460251046,
1849
+ "grad_norm": 5.782260894775391,
1850
+ "learning_rate": 1.760690850665177e-05,
1851
+ "loss": 0.2356,
1852
+ "step": 241
1853
+ },
1854
+ {
1855
+ "epoch": 0.30376569037656903,
1856
+ "grad_norm": 4.108422756195068,
1857
+ "learning_rate": 1.7578354189362183e-05,
1858
+ "loss": 0.2658,
1859
+ "step": 242
1860
+ },
1861
+ {
1862
+ "epoch": 0.30502092050209206,
1863
+ "grad_norm": 2.872807264328003,
1864
+ "learning_rate": 1.7549653974554835e-05,
1865
+ "loss": 0.3048,
1866
+ "step": 243
1867
+ },
1868
+ {
1869
+ "epoch": 0.30627615062761504,
1870
+ "grad_norm": 3.7681846618652344,
1871
+ "learning_rate": 1.752080841476264e-05,
1872
+ "loss": 0.2832,
1873
+ "step": 244
1874
+ },
1875
+ {
1876
+ "epoch": 0.3075313807531381,
1877
+ "grad_norm": 6.7302069664001465,
1878
+ "learning_rate": 1.7491818065316676e-05,
1879
+ "loss": 0.2518,
1880
+ "step": 245
1881
+ },
1882
+ {
1883
+ "epoch": 0.3087866108786611,
1884
+ "grad_norm": 7.851168155670166,
1885
+ "learning_rate": 1.7462683484335477e-05,
1886
+ "loss": 0.2188,
1887
+ "step": 246
1888
+ },
1889
+ {
1890
+ "epoch": 0.3100418410041841,
1891
+ "grad_norm": 5.26230001449585,
1892
+ "learning_rate": 1.7433405232714325e-05,
1893
+ "loss": 0.2898,
1894
+ "step": 247
1895
+ },
1896
+ {
1897
+ "epoch": 0.3112970711297071,
1898
+ "grad_norm": 3.618230104446411,
1899
+ "learning_rate": 1.7403983874114422e-05,
1900
+ "loss": 0.2303,
1901
+ "step": 248
1902
+ },
1903
+ {
1904
+ "epoch": 0.31255230125523015,
1905
+ "grad_norm": 3.8040518760681152,
1906
+ "learning_rate": 1.7374419974952045e-05,
1907
+ "loss": 0.3179,
1908
+ "step": 249
1909
+ },
1910
+ {
1911
+ "epoch": 0.3138075313807531,
1912
+ "grad_norm": 3.1975717544555664,
1913
+ "learning_rate": 1.734471410438765e-05,
1914
+ "loss": 0.2503,
1915
+ "step": 250
1916
+ },
1917
+ {
1918
+ "epoch": 0.31506276150627616,
1919
+ "grad_norm": 2.8378207683563232,
1920
+ "learning_rate": 1.731486683431491e-05,
1921
+ "loss": 0.2424,
1922
+ "step": 251
1923
+ },
1924
+ {
1925
+ "epoch": 0.3163179916317992,
1926
+ "grad_norm": 5.816548824310303,
1927
+ "learning_rate": 1.728487873934969e-05,
1928
+ "loss": 0.2567,
1929
+ "step": 252
1930
+ },
1931
+ {
1932
+ "epoch": 0.31757322175732217,
1933
+ "grad_norm": 3.5895259380340576,
1934
+ "learning_rate": 1.7254750396819008e-05,
1935
+ "loss": 0.2762,
1936
+ "step": 253
1937
+ },
1938
+ {
1939
+ "epoch": 0.3188284518828452,
1940
+ "grad_norm": 3.293178081512451,
1941
+ "learning_rate": 1.7224482386749916e-05,
1942
+ "loss": 0.2801,
1943
+ "step": 254
1944
+ },
1945
+ {
1946
+ "epoch": 0.3200836820083682,
1947
+ "grad_norm": 3.76770281791687,
1948
+ "learning_rate": 1.719407529185831e-05,
1949
+ "loss": 0.2545,
1950
+ "step": 255
1951
+ },
1952
+ {
1953
+ "epoch": 0.3213389121338912,
1954
+ "grad_norm": 3.1176042556762695,
1955
+ "learning_rate": 1.7163529697537756e-05,
1956
+ "loss": 0.2608,
1957
+ "step": 256
1958
+ },
1959
+ {
1960
+ "epoch": 0.32259414225941424,
1961
+ "grad_norm": 3.789315700531006,
1962
+ "learning_rate": 1.7132846191848167e-05,
1963
+ "loss": 0.2708,
1964
+ "step": 257
1965
+ },
1966
+ {
1967
+ "epoch": 0.3238493723849372,
1968
+ "grad_norm": 5.817142963409424,
1969
+ "learning_rate": 1.7102025365504524e-05,
1970
+ "loss": 0.3254,
1971
+ "step": 258
1972
+ },
1973
+ {
1974
+ "epoch": 0.32510460251046025,
1975
+ "grad_norm": 4.174067497253418,
1976
+ "learning_rate": 1.7071067811865477e-05,
1977
+ "loss": 0.2826,
1978
+ "step": 259
1979
+ },
1980
+ {
1981
+ "epoch": 0.3263598326359833,
1982
+ "grad_norm": 4.383941173553467,
1983
+ "learning_rate": 1.7039974126921946e-05,
1984
+ "loss": 0.3245,
1985
+ "step": 260
1986
+ },
1987
+ {
1988
+ "epoch": 0.3263598326359833,
1989
+ "eval_accuracy": 0.8465783664459161,
1990
+ "eval_f1": 0.6774941995359629,
1991
+ "eval_loss": 0.30984166264533997,
1992
+ "eval_precision": 0.8248587570621468,
1993
+ "eval_recall": 0.5748031496062992,
1994
+ "eval_runtime": 52.3032,
1995
+ "eval_samples_per_second": 5.296,
1996
+ "eval_steps_per_second": 0.172,
1997
+ "step": 260
1998
+ },
1999
+ {
2000
+ "epoch": 0.32761506276150626,
2001
+ "grad_norm": 4.471529960632324,
2002
+ "learning_rate": 1.7008744909285626e-05,
2003
+ "loss": 0.2658,
2004
+ "step": 261
2005
+ },
2006
+ {
2007
+ "epoch": 0.3288702928870293,
2008
+ "grad_norm": 4.479955673217773,
2009
+ "learning_rate": 1.6977380760177467e-05,
2010
+ "loss": 0.3076,
2011
+ "step": 262
2012
+ },
2013
+ {
2014
+ "epoch": 0.3301255230125523,
2015
+ "grad_norm": 3.6632466316223145,
2016
+ "learning_rate": 1.694588228341611e-05,
2017
+ "loss": 0.2387,
2018
+ "step": 263
2019
+ },
2020
+ {
2021
+ "epoch": 0.3313807531380753,
2022
+ "grad_norm": 3.813127040863037,
2023
+ "learning_rate": 1.691425008540625e-05,
2024
+ "loss": 0.2575,
2025
+ "step": 264
2026
+ },
2027
+ {
2028
+ "epoch": 0.33263598326359833,
2029
+ "grad_norm": 3.7820916175842285,
2030
+ "learning_rate": 1.6882484775126968e-05,
2031
+ "loss": 0.2517,
2032
+ "step": 265
2033
+ },
2034
+ {
2035
+ "epoch": 0.33389121338912137,
2036
+ "grad_norm": 3.487283229827881,
2037
+ "learning_rate": 1.6850586964120005e-05,
2038
+ "loss": 0.2898,
2039
+ "step": 266
2040
+ },
2041
+ {
2042
+ "epoch": 0.33514644351464434,
2043
+ "grad_norm": 5.123818397521973,
2044
+ "learning_rate": 1.6818557266477993e-05,
2045
+ "loss": 0.2758,
2046
+ "step": 267
2047
+ },
2048
+ {
2049
+ "epoch": 0.3364016736401674,
2050
+ "grad_norm": 3.208160400390625,
2051
+ "learning_rate": 1.6786396298832622e-05,
2052
+ "loss": 0.2846,
2053
+ "step": 268
2054
+ },
2055
+ {
2056
+ "epoch": 0.33765690376569035,
2057
+ "grad_norm": 2.8521032333374023,
2058
+ "learning_rate": 1.6754104680342783e-05,
2059
+ "loss": 0.2573,
2060
+ "step": 269
2061
+ },
2062
+ {
2063
+ "epoch": 0.3389121338912134,
2064
+ "grad_norm": 2.8169782161712646,
2065
+ "learning_rate": 1.6721683032682637e-05,
2066
+ "loss": 0.2259,
2067
+ "step": 270
2068
+ },
2069
+ {
2070
+ "epoch": 0.3401673640167364,
2071
+ "grad_norm": 3.7779228687286377,
2072
+ "learning_rate": 1.6689131980029647e-05,
2073
+ "loss": 0.2947,
2074
+ "step": 271
2075
+ },
2076
+ {
2077
+ "epoch": 0.3414225941422594,
2078
+ "grad_norm": 4.368408203125,
2079
+ "learning_rate": 1.6656452149052568e-05,
2080
+ "loss": 0.2654,
2081
+ "step": 272
2082
+ },
2083
+ {
2084
+ "epoch": 0.3426778242677824,
2085
+ "grad_norm": 3.421369791030884,
2086
+ "learning_rate": 1.662364416889938e-05,
2087
+ "loss": 0.2921,
2088
+ "step": 273
2089
+ },
2090
+ {
2091
+ "epoch": 0.34393305439330546,
2092
+ "grad_norm": 4.275522232055664,
2093
+ "learning_rate": 1.6590708671185176e-05,
2094
+ "loss": 0.2527,
2095
+ "step": 274
2096
+ },
2097
+ {
2098
+ "epoch": 0.34518828451882844,
2099
+ "grad_norm": 3.0027596950531006,
2100
+ "learning_rate": 1.6557646289979996e-05,
2101
+ "loss": 0.2031,
2102
+ "step": 275
2103
+ },
2104
+ {
2105
+ "epoch": 0.34644351464435147,
2106
+ "grad_norm": 3.2799339294433594,
2107
+ "learning_rate": 1.6524457661796626e-05,
2108
+ "loss": 0.2276,
2109
+ "step": 276
2110
+ },
2111
+ {
2112
+ "epoch": 0.3476987447698745,
2113
+ "grad_norm": 3.7090659141540527,
2114
+ "learning_rate": 1.6491143425578345e-05,
2115
+ "loss": 0.2264,
2116
+ "step": 277
2117
+ },
2118
+ {
2119
+ "epoch": 0.3489539748953975,
2120
+ "grad_norm": 6.081251621246338,
2121
+ "learning_rate": 1.645770422268662e-05,
2122
+ "loss": 0.3315,
2123
+ "step": 278
2124
+ },
2125
+ {
2126
+ "epoch": 0.3502092050209205,
2127
+ "grad_norm": 5.695575714111328,
2128
+ "learning_rate": 1.6424140696888765e-05,
2129
+ "loss": 0.2948,
2130
+ "step": 279
2131
+ },
2132
+ {
2133
+ "epoch": 0.3514644351464435,
2134
+ "grad_norm": 4.191822052001953,
2135
+ "learning_rate": 1.639045349434554e-05,
2136
+ "loss": 0.2868,
2137
+ "step": 280
2138
+ },
2139
+ {
2140
+ "epoch": 0.3514644351464435,
2141
+ "eval_accuracy": 0.8432671081677704,
2142
+ "eval_f1": 0.6830357142857143,
2143
+ "eval_loss": 0.3159337043762207,
2144
+ "eval_precision": 0.788659793814433,
2145
+ "eval_recall": 0.6023622047244095,
2146
+ "eval_runtime": 52.6769,
2147
+ "eval_samples_per_second": 5.258,
2148
+ "eval_steps_per_second": 0.171,
2149
+ "step": 280
2150
+ },
2151
+ {
2152
+ "epoch": 0.3527196652719665,
2153
+ "grad_norm": 4.9057183265686035,
2154
+ "learning_rate": 1.6356643263598716e-05,
2155
+ "loss": 0.3545,
2156
+ "step": 281
2157
+ },
2158
+ {
2159
+ "epoch": 0.35397489539748955,
2160
+ "grad_norm": 6.470303058624268,
2161
+ "learning_rate": 1.6322710655558577e-05,
2162
+ "loss": 0.3414,
2163
+ "step": 282
2164
+ },
2165
+ {
2166
+ "epoch": 0.35523012552301253,
2167
+ "grad_norm": 3.9251017570495605,
2168
+ "learning_rate": 1.6288656323491415e-05,
2169
+ "loss": 0.2573,
2170
+ "step": 283
2171
+ },
2172
+ {
2173
+ "epoch": 0.35648535564853556,
2174
+ "grad_norm": 4.604090213775635,
2175
+ "learning_rate": 1.6254480923006924e-05,
2176
+ "loss": 0.226,
2177
+ "step": 284
2178
+ },
2179
+ {
2180
+ "epoch": 0.3577405857740586,
2181
+ "grad_norm": 6.23361873626709,
2182
+ "learning_rate": 1.6220185112045606e-05,
2183
+ "loss": 0.2693,
2184
+ "step": 285
2185
+ },
2186
+ {
2187
+ "epoch": 0.35899581589958157,
2188
+ "grad_norm": 3.5196187496185303,
2189
+ "learning_rate": 1.6185769550866073e-05,
2190
+ "loss": 0.2104,
2191
+ "step": 286
2192
+ },
2193
+ {
2194
+ "epoch": 0.3602510460251046,
2195
+ "grad_norm": 5.589550495147705,
2196
+ "learning_rate": 1.6151234902032374e-05,
2197
+ "loss": 0.3379,
2198
+ "step": 287
2199
+ },
2200
+ {
2201
+ "epoch": 0.36150627615062764,
2202
+ "grad_norm": 3.052987813949585,
2203
+ "learning_rate": 1.6116581830401193e-05,
2204
+ "loss": 0.2646,
2205
+ "step": 288
2206
+ },
2207
+ {
2208
+ "epoch": 0.3627615062761506,
2209
+ "grad_norm": 2.715062141418457,
2210
+ "learning_rate": 1.60818110031091e-05,
2211
+ "loss": 0.2731,
2212
+ "step": 289
2213
+ },
2214
+ {
2215
+ "epoch": 0.36401673640167365,
2216
+ "grad_norm": 3.9851012229919434,
2217
+ "learning_rate": 1.6046923089559667e-05,
2218
+ "loss": 0.2482,
2219
+ "step": 290
2220
+ },
2221
+ {
2222
+ "epoch": 0.3652719665271967,
2223
+ "grad_norm": 4.131580352783203,
2224
+ "learning_rate": 1.6011918761410596e-05,
2225
+ "loss": 0.2916,
2226
+ "step": 291
2227
+ },
2228
+ {
2229
+ "epoch": 0.36652719665271966,
2230
+ "grad_norm": 5.364291667938232,
2231
+ "learning_rate": 1.5976798692560796e-05,
2232
+ "loss": 0.3029,
2233
+ "step": 292
2234
+ },
2235
+ {
2236
+ "epoch": 0.3677824267782427,
2237
+ "grad_norm": 3.139458417892456,
2238
+ "learning_rate": 1.5941563559137398e-05,
2239
+ "loss": 0.2396,
2240
+ "step": 293
2241
+ },
2242
+ {
2243
+ "epoch": 0.36903765690376567,
2244
+ "grad_norm": 3.1862568855285645,
2245
+ "learning_rate": 1.5906214039482732e-05,
2246
+ "loss": 0.2504,
2247
+ "step": 294
2248
+ },
2249
+ {
2250
+ "epoch": 0.3702928870292887,
2251
+ "grad_norm": 3.489682912826538,
2252
+ "learning_rate": 1.5870750814141296e-05,
2253
+ "loss": 0.2214,
2254
+ "step": 295
2255
+ },
2256
+ {
2257
+ "epoch": 0.37154811715481173,
2258
+ "grad_norm": 4.336936950683594,
2259
+ "learning_rate": 1.5835174565846624e-05,
2260
+ "loss": 0.3056,
2261
+ "step": 296
2262
+ },
2263
+ {
2264
+ "epoch": 0.3728033472803347,
2265
+ "grad_norm": 3.281315803527832,
2266
+ "learning_rate": 1.579948597950815e-05,
2267
+ "loss": 0.2579,
2268
+ "step": 297
2269
+ },
2270
+ {
2271
+ "epoch": 0.37405857740585774,
2272
+ "grad_norm": 7.08855676651001,
2273
+ "learning_rate": 1.576368574219804e-05,
2274
+ "loss": 0.295,
2275
+ "step": 298
2276
+ },
2277
+ {
2278
+ "epoch": 0.37531380753138077,
2279
+ "grad_norm": 5.177116394042969,
2280
+ "learning_rate": 1.5727774543137927e-05,
2281
+ "loss": 0.2363,
2282
+ "step": 299
2283
+ },
2284
+ {
2285
+ "epoch": 0.37656903765690375,
2286
+ "grad_norm": 2.4472217559814453,
2287
+ "learning_rate": 1.5691753073685692e-05,
2288
+ "loss": 0.2601,
2289
+ "step": 300
2290
+ },
2291
+ {
2292
+ "epoch": 0.37656903765690375,
2293
+ "eval_accuracy": 0.8587196467991169,
2294
+ "eval_f1": 0.7387755102040816,
2295
+ "eval_loss": 0.31048765778541565,
2296
+ "eval_precision": 0.7669491525423728,
2297
+ "eval_recall": 0.7125984251968503,
2298
+ "eval_runtime": 52.6769,
2299
+ "eval_samples_per_second": 5.258,
2300
+ "eval_steps_per_second": 0.171,
2301
+ "step": 300
2302
  }
2303
  ],
2304
  "logging_steps": 1,
 
2318
  "attributes": {}
2319
  }
2320
  },
2321
+ "total_flos": 1.0951134131571917e+17,
2322
  "train_batch_size": 6,
2323
  "trial_name": null,
2324
  "trial_params": null