mtzig commited on
Commit
c8ef62a
·
verified ·
1 Parent(s): aafb541

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4eee0bc20bfe612a2406db1927bad535b871029a1459cdfff99c1d8c6c7f3b63
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6939fa8cdf25c762650f2d3d3355a3af6563f0ff00721b627282f60908c3f59
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8480d8b71bc4ba12fadce2b7092485478b8c309ecce318c15ffc6f83a418ea33
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7b3f1a318e2cc496fca5a0e17c82951b82e48847a73776b67435515bd3b117
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:865ffb2bdf7738b5a7a48e25068e631a1f4cfd3495ea1df1c76166542115412a
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c357e6d93334bf277b9c7d69bbecfbf0d574ee83594d4aefca9a358b7697dc63
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9bc6404ab67370a58b70ca5d2e8919c5e01e34f1cb289a4a6bd798d70aee2dbd
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3df6316f9b70a49fe93ecd1dcb217b8cf2b9f1d06201e9cd378d30cf1ac2444
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3498e0b6a4e7ed2241f24f000b2120ffa644d285a44cfde97745c9efb6ed358b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:735c4d9885bdc1f40b2e139d1c5be6fdf8fb80870faadee69ed0f5b65eb4bf69
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d06a365662a6d32a03d081ca66ae94093585c255a49fe32e4fc6101155e341c
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fa86e6ee20e276dd223620f5f73051d773a3cf00f532b08afcae9924c2b3c84
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44818d96fc5cb3fb73cb12c5017e94708a24961757ad115fff879a4c54351a1b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1545702147d9ee97bdd5dc7bbcde2426e2ae21146f2807dae5aded5896daaa7
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7503aeea618e7970daff2e762d6b9cc3c0b593f25c7e566d92c8b37634b729e0
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07fbf2ec86fb6b1d0da0836072b0dca651e3c394204002f1ed426eecc30a569a
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7be0f10bff4b59eb4d3472c8dc5f6f8b12c709dd561a83d4586f3461ec1745a5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b97d0b3b52285a9a23cb592a2b624fc30f4e8b1b49dee2093250a42185714fae
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e61888020fafc126b7e547b5961b63a5561eea0a9665cf9acb78e192fc0856bc
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14c4dc09b19022d70ab1e7f40c998143b4e686e79e0e83e59d15521c0f0652bd
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:065119fcdbace59dd30c03371fc097ed8d58b83537d1b5e3a1f5c321afd26dfd
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed26153973e4964d16eab6644bf6fd88bae202c6e07155a00cad65a1de0f6bcf
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:061f461111f5cd0052d853db52e46aef61f148d9da594c2cc07a97c23921266c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ddf99c805abb2056960209b3190c0de826effa11fd4c4b1fb50ceb4079b7f5b
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2f6d7e0e198940381bc01669f2b59ed3c54273b38889812ff9b29559c995120
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36e32dd1a5ae7a273ff3b5c3ef07147f38ae674d07fd46f56c265e9de492bff2
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.24752475247524752,
5
  "eval_steps": 20,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1539,6 +1539,766 @@
1539
  "eval_samples_per_second": 5.728,
1540
  "eval_steps_per_second": 0.187,
1541
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1542
  }
1543
  ],
1544
  "logging_steps": 1,
@@ -1558,7 +2318,7 @@
1558
  "attributes": {}
1559
  }
1560
  },
1561
- "total_flos": 6.099415773216768e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3712871287128713,
5
  "eval_steps": 20,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1539
  "eval_samples_per_second": 5.728,
1540
  "eval_steps_per_second": 0.187,
1541
  "step": 200
1542
+ },
1543
+ {
1544
+ "epoch": 0.24876237623762376,
1545
+ "grad_norm": 8.231904983520508,
1546
+ "learning_rate": 1.8685351516343277e-05,
1547
+ "loss": 0.2536,
1548
+ "step": 201
1549
+ },
1550
+ {
1551
+ "epoch": 0.25,
1552
+ "grad_norm": 4.1948041915893555,
1553
+ "learning_rate": 1.8663852883425045e-05,
1554
+ "loss": 0.295,
1555
+ "step": 202
1556
+ },
1557
+ {
1558
+ "epoch": 0.25123762376237624,
1559
+ "grad_norm": 5.275994777679443,
1560
+ "learning_rate": 1.86421924643655e-05,
1561
+ "loss": 0.2739,
1562
+ "step": 203
1563
+ },
1564
+ {
1565
+ "epoch": 0.2524752475247525,
1566
+ "grad_norm": 4.482490062713623,
1567
+ "learning_rate": 1.8620370663644676e-05,
1568
+ "loss": 0.1973,
1569
+ "step": 204
1570
+ },
1571
+ {
1572
+ "epoch": 0.2537128712871287,
1573
+ "grad_norm": 3.5862739086151123,
1574
+ "learning_rate": 1.8598387888756224e-05,
1575
+ "loss": 0.2146,
1576
+ "step": 205
1577
+ },
1578
+ {
1579
+ "epoch": 0.25495049504950495,
1580
+ "grad_norm": 3.1812057495117188,
1581
+ "learning_rate": 1.857624455019976e-05,
1582
+ "loss": 0.2104,
1583
+ "step": 206
1584
+ },
1585
+ {
1586
+ "epoch": 0.2561881188118812,
1587
+ "grad_norm": 6.783030986785889,
1588
+ "learning_rate": 1.855394106147322e-05,
1589
+ "loss": 0.2979,
1590
+ "step": 207
1591
+ },
1592
+ {
1593
+ "epoch": 0.25742574257425743,
1594
+ "grad_norm": 6.715686321258545,
1595
+ "learning_rate": 1.853147783906514e-05,
1596
+ "loss": 0.2952,
1597
+ "step": 208
1598
+ },
1599
+ {
1600
+ "epoch": 0.25866336633663367,
1601
+ "grad_norm": 5.6060028076171875,
1602
+ "learning_rate": 1.8508855302446868e-05,
1603
+ "loss": 0.2323,
1604
+ "step": 209
1605
+ },
1606
+ {
1607
+ "epoch": 0.2599009900990099,
1608
+ "grad_norm": 4.204987049102783,
1609
+ "learning_rate": 1.8486073874064745e-05,
1610
+ "loss": 0.216,
1611
+ "step": 210
1612
+ },
1613
+ {
1614
+ "epoch": 0.26113861386138615,
1615
+ "grad_norm": 5.449676036834717,
1616
+ "learning_rate": 1.84631339793322e-05,
1617
+ "loss": 0.1925,
1618
+ "step": 211
1619
+ },
1620
+ {
1621
+ "epoch": 0.2623762376237624,
1622
+ "grad_norm": 4.839028835296631,
1623
+ "learning_rate": 1.8440036046621816e-05,
1624
+ "loss": 0.2531,
1625
+ "step": 212
1626
+ },
1627
+ {
1628
+ "epoch": 0.2636138613861386,
1629
+ "grad_norm": 5.816053867340088,
1630
+ "learning_rate": 1.8416780507257334e-05,
1631
+ "loss": 0.2866,
1632
+ "step": 213
1633
+ },
1634
+ {
1635
+ "epoch": 0.26485148514851486,
1636
+ "grad_norm": 4.106687545776367,
1637
+ "learning_rate": 1.8393367795505587e-05,
1638
+ "loss": 0.1797,
1639
+ "step": 214
1640
+ },
1641
+ {
1642
+ "epoch": 0.2660891089108911,
1643
+ "grad_norm": 3.8408498764038086,
1644
+ "learning_rate": 1.8369798348568403e-05,
1645
+ "loss": 0.2328,
1646
+ "step": 215
1647
+ },
1648
+ {
1649
+ "epoch": 0.26732673267326734,
1650
+ "grad_norm": 3.9387855529785156,
1651
+ "learning_rate": 1.834607260657443e-05,
1652
+ "loss": 0.2067,
1653
+ "step": 216
1654
+ },
1655
+ {
1656
+ "epoch": 0.2685643564356436,
1657
+ "grad_norm": 3.855027198791504,
1658
+ "learning_rate": 1.832219101257092e-05,
1659
+ "loss": 0.2408,
1660
+ "step": 217
1661
+ },
1662
+ {
1663
+ "epoch": 0.2698019801980198,
1664
+ "grad_norm": 5.5736494064331055,
1665
+ "learning_rate": 1.829815401251547e-05,
1666
+ "loss": 0.2225,
1667
+ "step": 218
1668
+ },
1669
+ {
1670
+ "epoch": 0.27103960396039606,
1671
+ "grad_norm": 5.179149150848389,
1672
+ "learning_rate": 1.8273962055267667e-05,
1673
+ "loss": 0.2575,
1674
+ "step": 219
1675
+ },
1676
+ {
1677
+ "epoch": 0.2722772277227723,
1678
+ "grad_norm": 8.503008842468262,
1679
+ "learning_rate": 1.8249615592580733e-05,
1680
+ "loss": 0.2965,
1681
+ "step": 220
1682
+ },
1683
+ {
1684
+ "epoch": 0.2722772277227723,
1685
+ "eval_accuracy": 0.844789356984479,
1686
+ "eval_f1": 0.6682464454976303,
1687
+ "eval_loss": 0.3314219117164612,
1688
+ "eval_precision": 0.834319526627219,
1689
+ "eval_recall": 0.5573122529644269,
1690
+ "eval_runtime": 48.2502,
1691
+ "eval_samples_per_second": 5.72,
1692
+ "eval_steps_per_second": 0.187,
1693
+ "step": 220
1694
+ },
1695
+ {
1696
+ "epoch": 0.27351485148514854,
1697
+ "grad_norm": 4.444825172424316,
1698
+ "learning_rate": 1.822511507909307e-05,
1699
+ "loss": 0.1907,
1700
+ "step": 221
1701
+ },
1702
+ {
1703
+ "epoch": 0.2747524752475248,
1704
+ "grad_norm": 6.425011157989502,
1705
+ "learning_rate": 1.8200460972319786e-05,
1706
+ "loss": 0.2938,
1707
+ "step": 222
1708
+ },
1709
+ {
1710
+ "epoch": 0.275990099009901,
1711
+ "grad_norm": 3.5462961196899414,
1712
+ "learning_rate": 1.817565373264413e-05,
1713
+ "loss": 0.2045,
1714
+ "step": 223
1715
+ },
1716
+ {
1717
+ "epoch": 0.27722772277227725,
1718
+ "grad_norm": 5.254908084869385,
1719
+ "learning_rate": 1.8150693823308913e-05,
1720
+ "loss": 0.1644,
1721
+ "step": 224
1722
+ },
1723
+ {
1724
+ "epoch": 0.2784653465346535,
1725
+ "grad_norm": 4.101227283477783,
1726
+ "learning_rate": 1.8125581710407864e-05,
1727
+ "loss": 0.1875,
1728
+ "step": 225
1729
+ },
1730
+ {
1731
+ "epoch": 0.27970297029702973,
1732
+ "grad_norm": 3.370792865753174,
1733
+ "learning_rate": 1.8100317862876902e-05,
1734
+ "loss": 0.1715,
1735
+ "step": 226
1736
+ },
1737
+ {
1738
+ "epoch": 0.28094059405940597,
1739
+ "grad_norm": 4.758403778076172,
1740
+ "learning_rate": 1.8074902752485392e-05,
1741
+ "loss": 0.2956,
1742
+ "step": 227
1743
+ },
1744
+ {
1745
+ "epoch": 0.28217821782178215,
1746
+ "grad_norm": 5.75641393661499,
1747
+ "learning_rate": 1.8049336853827343e-05,
1748
+ "loss": 0.2601,
1749
+ "step": 228
1750
+ },
1751
+ {
1752
+ "epoch": 0.2834158415841584,
1753
+ "grad_norm": 3.241687059402466,
1754
+ "learning_rate": 1.8023620644312538e-05,
1755
+ "loss": 0.2022,
1756
+ "step": 229
1757
+ },
1758
+ {
1759
+ "epoch": 0.28465346534653463,
1760
+ "grad_norm": 4.085322856903076,
1761
+ "learning_rate": 1.7997754604157607e-05,
1762
+ "loss": 0.2132,
1763
+ "step": 230
1764
+ },
1765
+ {
1766
+ "epoch": 0.28589108910891087,
1767
+ "grad_norm": 5.415487766265869,
1768
+ "learning_rate": 1.797173921637709e-05,
1769
+ "loss": 0.1825,
1770
+ "step": 231
1771
+ },
1772
+ {
1773
+ "epoch": 0.2871287128712871,
1774
+ "grad_norm": 4.1402907371521,
1775
+ "learning_rate": 1.794557496677438e-05,
1776
+ "loss": 0.2029,
1777
+ "step": 232
1778
+ },
1779
+ {
1780
+ "epoch": 0.28836633663366334,
1781
+ "grad_norm": 4.597172737121582,
1782
+ "learning_rate": 1.791926234393268e-05,
1783
+ "loss": 0.1929,
1784
+ "step": 233
1785
+ },
1786
+ {
1787
+ "epoch": 0.2896039603960396,
1788
+ "grad_norm": 6.450316905975342,
1789
+ "learning_rate": 1.7892801839205867e-05,
1790
+ "loss": 0.2061,
1791
+ "step": 234
1792
+ },
1793
+ {
1794
+ "epoch": 0.2908415841584158,
1795
+ "grad_norm": 4.549274444580078,
1796
+ "learning_rate": 1.786619394670933e-05,
1797
+ "loss": 0.2288,
1798
+ "step": 235
1799
+ },
1800
+ {
1801
+ "epoch": 0.29207920792079206,
1802
+ "grad_norm": 8.562817573547363,
1803
+ "learning_rate": 1.7839439163310714e-05,
1804
+ "loss": 0.2791,
1805
+ "step": 236
1806
+ },
1807
+ {
1808
+ "epoch": 0.2933168316831683,
1809
+ "grad_norm": 4.906472206115723,
1810
+ "learning_rate": 1.7812537988620678e-05,
1811
+ "loss": 0.2505,
1812
+ "step": 237
1813
+ },
1814
+ {
1815
+ "epoch": 0.29455445544554454,
1816
+ "grad_norm": 4.514908790588379,
1817
+ "learning_rate": 1.7785490924983526e-05,
1818
+ "loss": 0.2033,
1819
+ "step": 238
1820
+ },
1821
+ {
1822
+ "epoch": 0.2957920792079208,
1823
+ "grad_norm": 5.586214065551758,
1824
+ "learning_rate": 1.7758298477467865e-05,
1825
+ "loss": 0.1828,
1826
+ "step": 239
1827
+ },
1828
+ {
1829
+ "epoch": 0.297029702970297,
1830
+ "grad_norm": 11.2735595703125,
1831
+ "learning_rate": 1.7730961153857155e-05,
1832
+ "loss": 0.2379,
1833
+ "step": 240
1834
+ },
1835
+ {
1836
+ "epoch": 0.297029702970297,
1837
+ "eval_accuracy": 0.8148558758314856,
1838
+ "eval_f1": 0.5570291777188329,
1839
+ "eval_loss": 0.3735515773296356,
1840
+ "eval_precision": 0.8467741935483871,
1841
+ "eval_recall": 0.4150197628458498,
1842
+ "eval_runtime": 48.8987,
1843
+ "eval_samples_per_second": 5.644,
1844
+ "eval_steps_per_second": 0.184,
1845
+ "step": 240
1846
+ },
1847
+ {
1848
+ "epoch": 0.29826732673267325,
1849
+ "grad_norm": 6.920717239379883,
1850
+ "learning_rate": 1.7703479464640218e-05,
1851
+ "loss": 0.2756,
1852
+ "step": 241
1853
+ },
1854
+ {
1855
+ "epoch": 0.2995049504950495,
1856
+ "grad_norm": 5.730903625488281,
1857
+ "learning_rate": 1.767585392300172e-05,
1858
+ "loss": 0.1745,
1859
+ "step": 242
1860
+ },
1861
+ {
1862
+ "epoch": 0.30074257425742573,
1863
+ "grad_norm": 4.035462856292725,
1864
+ "learning_rate": 1.764808504481259e-05,
1865
+ "loss": 0.1666,
1866
+ "step": 243
1867
+ },
1868
+ {
1869
+ "epoch": 0.30198019801980197,
1870
+ "grad_norm": 4.878346920013428,
1871
+ "learning_rate": 1.7620173348620368e-05,
1872
+ "loss": 0.2491,
1873
+ "step": 244
1874
+ },
1875
+ {
1876
+ "epoch": 0.3032178217821782,
1877
+ "grad_norm": 3.8003768920898438,
1878
+ "learning_rate": 1.7592119355639545e-05,
1879
+ "loss": 0.2041,
1880
+ "step": 245
1881
+ },
1882
+ {
1883
+ "epoch": 0.30445544554455445,
1884
+ "grad_norm": 6.53809118270874,
1885
+ "learning_rate": 1.7563923589741806e-05,
1886
+ "loss": 0.2415,
1887
+ "step": 246
1888
+ },
1889
+ {
1890
+ "epoch": 0.3056930693069307,
1891
+ "grad_norm": 3.5466408729553223,
1892
+ "learning_rate": 1.7535586577446274e-05,
1893
+ "loss": 0.1963,
1894
+ "step": 247
1895
+ },
1896
+ {
1897
+ "epoch": 0.3069306930693069,
1898
+ "grad_norm": 4.167630195617676,
1899
+ "learning_rate": 1.7507108847909656e-05,
1900
+ "loss": 0.2261,
1901
+ "step": 248
1902
+ },
1903
+ {
1904
+ "epoch": 0.30816831683168316,
1905
+ "grad_norm": 4.359383583068848,
1906
+ "learning_rate": 1.7478490932916374e-05,
1907
+ "loss": 0.1888,
1908
+ "step": 249
1909
+ },
1910
+ {
1911
+ "epoch": 0.3094059405940594,
1912
+ "grad_norm": 8.298726081848145,
1913
+ "learning_rate": 1.744973336686862e-05,
1914
+ "loss": 0.2532,
1915
+ "step": 250
1916
+ },
1917
+ {
1918
+ "epoch": 0.31064356435643564,
1919
+ "grad_norm": 5.459946632385254,
1920
+ "learning_rate": 1.74208366867764e-05,
1921
+ "loss": 0.2579,
1922
+ "step": 251
1923
+ },
1924
+ {
1925
+ "epoch": 0.3118811881188119,
1926
+ "grad_norm": 4.424745559692383,
1927
+ "learning_rate": 1.7391801432247487e-05,
1928
+ "loss": 0.2071,
1929
+ "step": 252
1930
+ },
1931
+ {
1932
+ "epoch": 0.3131188118811881,
1933
+ "grad_norm": 4.382404804229736,
1934
+ "learning_rate": 1.7362628145477355e-05,
1935
+ "loss": 0.2773,
1936
+ "step": 253
1937
+ },
1938
+ {
1939
+ "epoch": 0.31435643564356436,
1940
+ "grad_norm": 3.338047742843628,
1941
+ "learning_rate": 1.7333317371239046e-05,
1942
+ "loss": 0.2231,
1943
+ "step": 254
1944
+ },
1945
+ {
1946
+ "epoch": 0.3155940594059406,
1947
+ "grad_norm": 3.33626389503479,
1948
+ "learning_rate": 1.7303869656872994e-05,
1949
+ "loss": 0.2046,
1950
+ "step": 255
1951
+ },
1952
+ {
1953
+ "epoch": 0.31683168316831684,
1954
+ "grad_norm": 3.3837637901306152,
1955
+ "learning_rate": 1.727428555227683e-05,
1956
+ "loss": 0.1503,
1957
+ "step": 256
1958
+ },
1959
+ {
1960
+ "epoch": 0.3180693069306931,
1961
+ "grad_norm": 3.3898661136627197,
1962
+ "learning_rate": 1.7244565609895074e-05,
1963
+ "loss": 0.1641,
1964
+ "step": 257
1965
+ },
1966
+ {
1967
+ "epoch": 0.3193069306930693,
1968
+ "grad_norm": 5.7499918937683105,
1969
+ "learning_rate": 1.721471038470885e-05,
1970
+ "loss": 0.268,
1971
+ "step": 258
1972
+ },
1973
+ {
1974
+ "epoch": 0.32054455445544555,
1975
+ "grad_norm": 4.935744762420654,
1976
+ "learning_rate": 1.7184720434225518e-05,
1977
+ "loss": 0.2289,
1978
+ "step": 259
1979
+ },
1980
+ {
1981
+ "epoch": 0.3217821782178218,
1982
+ "grad_norm": 3.377199411392212,
1983
+ "learning_rate": 1.715459631846824e-05,
1984
+ "loss": 0.1587,
1985
+ "step": 260
1986
+ },
1987
+ {
1988
+ "epoch": 0.3217821782178218,
1989
+ "eval_accuracy": 0.8403547671840355,
1990
+ "eval_f1": 0.6435643564356436,
1991
+ "eval_loss": 0.33147069811820984,
1992
+ "eval_precision": 0.8609271523178808,
1993
+ "eval_recall": 0.5138339920948617,
1994
+ "eval_runtime": 47.9408,
1995
+ "eval_samples_per_second": 5.757,
1996
+ "eval_steps_per_second": 0.188,
1997
+ "step": 260
1998
+ },
1999
+ {
2000
+ "epoch": 0.32301980198019803,
2001
+ "grad_norm": 3.6289126873016357,
2002
+ "learning_rate": 1.712433859996555e-05,
2003
+ "loss": 0.2245,
2004
+ "step": 261
2005
+ },
2006
+ {
2007
+ "epoch": 0.32425742574257427,
2008
+ "grad_norm": 3.2000958919525146,
2009
+ "learning_rate": 1.7093947843740843e-05,
2010
+ "loss": 0.2251,
2011
+ "step": 262
2012
+ },
2013
+ {
2014
+ "epoch": 0.3254950495049505,
2015
+ "grad_norm": 4.913848400115967,
2016
+ "learning_rate": 1.706342461730181e-05,
2017
+ "loss": 0.1782,
2018
+ "step": 263
2019
+ },
2020
+ {
2021
+ "epoch": 0.32673267326732675,
2022
+ "grad_norm": 5.196519374847412,
2023
+ "learning_rate": 1.703276949062985e-05,
2024
+ "loss": 0.2259,
2025
+ "step": 264
2026
+ },
2027
+ {
2028
+ "epoch": 0.327970297029703,
2029
+ "grad_norm": 5.136421203613281,
2030
+ "learning_rate": 1.700198303616944e-05,
2031
+ "loss": 0.2132,
2032
+ "step": 265
2033
+ },
2034
+ {
2035
+ "epoch": 0.3292079207920792,
2036
+ "grad_norm": 4.810065746307373,
2037
+ "learning_rate": 1.6971065828817424e-05,
2038
+ "loss": 0.1883,
2039
+ "step": 266
2040
+ },
2041
+ {
2042
+ "epoch": 0.33044554455445546,
2043
+ "grad_norm": 4.666658878326416,
2044
+ "learning_rate": 1.6940018445912275e-05,
2045
+ "loss": 0.2087,
2046
+ "step": 267
2047
+ },
2048
+ {
2049
+ "epoch": 0.3316831683168317,
2050
+ "grad_norm": 4.813300132751465,
2051
+ "learning_rate": 1.690884146722334e-05,
2052
+ "loss": 0.2631,
2053
+ "step": 268
2054
+ },
2055
+ {
2056
+ "epoch": 0.33292079207920794,
2057
+ "grad_norm": 9.478407859802246,
2058
+ "learning_rate": 1.687753547493999e-05,
2059
+ "loss": 0.2618,
2060
+ "step": 269
2061
+ },
2062
+ {
2063
+ "epoch": 0.3341584158415842,
2064
+ "grad_norm": 4.668523788452148,
2065
+ "learning_rate": 1.684610105366076e-05,
2066
+ "loss": 0.2025,
2067
+ "step": 270
2068
+ },
2069
+ {
2070
+ "epoch": 0.3353960396039604,
2071
+ "grad_norm": 10.10991096496582,
2072
+ "learning_rate": 1.6814538790382432e-05,
2073
+ "loss": 0.2893,
2074
+ "step": 271
2075
+ },
2076
+ {
2077
+ "epoch": 0.33663366336633666,
2078
+ "grad_norm": 6.124392509460449,
2079
+ "learning_rate": 1.6782849274489055e-05,
2080
+ "loss": 0.2382,
2081
+ "step": 272
2082
+ },
2083
+ {
2084
+ "epoch": 0.3378712871287129,
2085
+ "grad_norm": 4.633864402770996,
2086
+ "learning_rate": 1.6751033097740978e-05,
2087
+ "loss": 0.1991,
2088
+ "step": 273
2089
+ },
2090
+ {
2091
+ "epoch": 0.33910891089108913,
2092
+ "grad_norm": 4.003640174865723,
2093
+ "learning_rate": 1.6719090854263752e-05,
2094
+ "loss": 0.1811,
2095
+ "step": 274
2096
+ },
2097
+ {
2098
+ "epoch": 0.34034653465346537,
2099
+ "grad_norm": 5.303317070007324,
2100
+ "learning_rate": 1.6687023140537082e-05,
2101
+ "loss": 0.3266,
2102
+ "step": 275
2103
+ },
2104
+ {
2105
+ "epoch": 0.3415841584158416,
2106
+ "grad_norm": 4.467435836791992,
2107
+ "learning_rate": 1.6654830555383648e-05,
2108
+ "loss": 0.2174,
2109
+ "step": 276
2110
+ },
2111
+ {
2112
+ "epoch": 0.34282178217821785,
2113
+ "grad_norm": 4.210566520690918,
2114
+ "learning_rate": 1.662251369995795e-05,
2115
+ "loss": 0.1746,
2116
+ "step": 277
2117
+ },
2118
+ {
2119
+ "epoch": 0.34405940594059403,
2120
+ "grad_norm": 3.8887202739715576,
2121
+ "learning_rate": 1.6590073177735066e-05,
2122
+ "loss": 0.19,
2123
+ "step": 278
2124
+ },
2125
+ {
2126
+ "epoch": 0.34529702970297027,
2127
+ "grad_norm": 4.348226547241211,
2128
+ "learning_rate": 1.6557509594499405e-05,
2129
+ "loss": 0.229,
2130
+ "step": 279
2131
+ },
2132
+ {
2133
+ "epoch": 0.3465346534653465,
2134
+ "grad_norm": 3.498028039932251,
2135
+ "learning_rate": 1.6524823558333362e-05,
2136
+ "loss": 0.1769,
2137
+ "step": 280
2138
+ },
2139
+ {
2140
+ "epoch": 0.3465346534653465,
2141
+ "eval_accuracy": 0.8370288248337029,
2142
+ "eval_f1": 0.6440677966101694,
2143
+ "eval_loss": 0.33291730284690857,
2144
+ "eval_precision": 0.83125,
2145
+ "eval_recall": 0.525691699604743,
2146
+ "eval_runtime": 48.169,
2147
+ "eval_samples_per_second": 5.73,
2148
+ "eval_steps_per_second": 0.187,
2149
+ "step": 280
2150
+ },
2151
+ {
2152
+ "epoch": 0.34777227722772275,
2153
+ "grad_norm": 5.36956262588501,
2154
+ "learning_rate": 1.6492015679605994e-05,
2155
+ "loss": 0.2361,
2156
+ "step": 281
2157
+ },
2158
+ {
2159
+ "epoch": 0.349009900990099,
2160
+ "grad_norm": 5.6981401443481445,
2161
+ "learning_rate": 1.6459086570961594e-05,
2162
+ "loss": 0.1696,
2163
+ "step": 282
2164
+ },
2165
+ {
2166
+ "epoch": 0.3502475247524752,
2167
+ "grad_norm": 5.104677677154541,
2168
+ "learning_rate": 1.6426036847308287e-05,
2169
+ "loss": 0.2587,
2170
+ "step": 283
2171
+ },
2172
+ {
2173
+ "epoch": 0.35148514851485146,
2174
+ "grad_norm": 4.432884216308594,
2175
+ "learning_rate": 1.6392867125806504e-05,
2176
+ "loss": 0.2231,
2177
+ "step": 284
2178
+ },
2179
+ {
2180
+ "epoch": 0.3527227722772277,
2181
+ "grad_norm": 8.529413223266602,
2182
+ "learning_rate": 1.6359578025857495e-05,
2183
+ "loss": 0.3018,
2184
+ "step": 285
2185
+ },
2186
+ {
2187
+ "epoch": 0.35396039603960394,
2188
+ "grad_norm": 3.8591082096099854,
2189
+ "learning_rate": 1.6326170169091735e-05,
2190
+ "loss": 0.2339,
2191
+ "step": 286
2192
+ },
2193
+ {
2194
+ "epoch": 0.3551980198019802,
2195
+ "grad_norm": 2.877532482147217,
2196
+ "learning_rate": 1.6292644179357337e-05,
2197
+ "loss": 0.168,
2198
+ "step": 287
2199
+ },
2200
+ {
2201
+ "epoch": 0.3564356435643564,
2202
+ "grad_norm": 4.591522693634033,
2203
+ "learning_rate": 1.6259000682708384e-05,
2204
+ "loss": 0.1687,
2205
+ "step": 288
2206
+ },
2207
+ {
2208
+ "epoch": 0.35767326732673266,
2209
+ "grad_norm": 4.433895111083984,
2210
+ "learning_rate": 1.622524030739326e-05,
2211
+ "loss": 0.2028,
2212
+ "step": 289
2213
+ },
2214
+ {
2215
+ "epoch": 0.3589108910891089,
2216
+ "grad_norm": 5.059347629547119,
2217
+ "learning_rate": 1.6191363683842883e-05,
2218
+ "loss": 0.2286,
2219
+ "step": 290
2220
+ },
2221
+ {
2222
+ "epoch": 0.36014851485148514,
2223
+ "grad_norm": 3.7166552543640137,
2224
+ "learning_rate": 1.615737144465898e-05,
2225
+ "loss": 0.1848,
2226
+ "step": 291
2227
+ },
2228
+ {
2229
+ "epoch": 0.3613861386138614,
2230
+ "grad_norm": 4.245189189910889,
2231
+ "learning_rate": 1.6123264224602245e-05,
2232
+ "loss": 0.2474,
2233
+ "step": 292
2234
+ },
2235
+ {
2236
+ "epoch": 0.3626237623762376,
2237
+ "grad_norm": 6.487268447875977,
2238
+ "learning_rate": 1.608904266058047e-05,
2239
+ "loss": 0.2036,
2240
+ "step": 293
2241
+ },
2242
+ {
2243
+ "epoch": 0.36386138613861385,
2244
+ "grad_norm": 3.333557605743408,
2245
+ "learning_rate": 1.605470739163669e-05,
2246
+ "loss": 0.1859,
2247
+ "step": 294
2248
+ },
2249
+ {
2250
+ "epoch": 0.3650990099009901,
2251
+ "grad_norm": 3.7823169231414795,
2252
+ "learning_rate": 1.6020259058937228e-05,
2253
+ "loss": 0.1713,
2254
+ "step": 295
2255
+ },
2256
+ {
2257
+ "epoch": 0.36633663366336633,
2258
+ "grad_norm": 3.356194496154785,
2259
+ "learning_rate": 1.5985698305759713e-05,
2260
+ "loss": 0.1774,
2261
+ "step": 296
2262
+ },
2263
+ {
2264
+ "epoch": 0.36757425742574257,
2265
+ "grad_norm": 4.501846790313721,
2266
+ "learning_rate": 1.59510257774811e-05,
2267
+ "loss": 0.1836,
2268
+ "step": 297
2269
+ },
2270
+ {
2271
+ "epoch": 0.3688118811881188,
2272
+ "grad_norm": 9.250550270080566,
2273
+ "learning_rate": 1.591624212156558e-05,
2274
+ "loss": 0.3101,
2275
+ "step": 298
2276
+ },
2277
+ {
2278
+ "epoch": 0.37004950495049505,
2279
+ "grad_norm": 3.5429160594940186,
2280
+ "learning_rate": 1.5881347987552517e-05,
2281
+ "loss": 0.1918,
2282
+ "step": 299
2283
+ },
2284
+ {
2285
+ "epoch": 0.3712871287128713,
2286
+ "grad_norm": 6.185944080352783,
2287
+ "learning_rate": 1.5846344027044307e-05,
2288
+ "loss": 0.1984,
2289
+ "step": 300
2290
+ },
2291
+ {
2292
+ "epoch": 0.3712871287128713,
2293
+ "eval_accuracy": 0.8536585365853658,
2294
+ "eval_f1": 0.6826923076923077,
2295
+ "eval_loss": 0.3210514485836029,
2296
+ "eval_precision": 0.8711656441717791,
2297
+ "eval_recall": 0.5612648221343873,
2298
+ "eval_runtime": 48.5015,
2299
+ "eval_samples_per_second": 5.691,
2300
+ "eval_steps_per_second": 0.186,
2301
+ "step": 300
2302
  }
2303
  ],
2304
  "logging_steps": 1,
 
2318
  "attributes": {}
2319
  }
2320
  },
2321
+ "total_flos": 9.14412078544978e+16,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null