mtzig commited on
Commit
0ee3d1d
·
verified ·
1 Parent(s): fec9711

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e408bfee6bc720f2626f42236cb7ed47eed15851394e64745280cda47346a24
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0828710b640dbf68298e27271b529b25b9633dd7711dad19568b25686b79955f
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eff316f99046dc065f6573097972dede0370afbf132b27b8c0122d95e707ef46
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09c7ac02ae4e3a773215b98acf254fe48c4e7de8555cfc804e4c8564253462d5
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff619163df21cd3fce301b8caa1204b04f12d8929b430f9ba8a93cc2b633db21
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fe68aa5543c7e1858414222033d21efd3626ac60315b4e8919cae752773097c
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9f0b1099d24eb7f394ea4f3fe171409b994687feddd6582b56703c5558366fc
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29b26a714efa8d1e7798d4d1a8c5ead91c9c253836ae76c8074e3b2d998b12ff
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddc7713c77dfea2848b11f758cffc94b65fdb54736e77c0647082559d10aa06b
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c95f08281fb03852e4519ff6aa382f5c90ec236a0542a483b82c72cfa5ad848e
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6610841a05b9998513c9700bc4bead2bfbd262e59da6e4197d08b8fa080a9641
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dae344d3faf7510fb590a7e7eaf2cfc188c6927842e743a0272a1df1c56f789d
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f09233c56315737a8a0656ed9c80d92a6963808314b0fe48bf44cb8c6799ef3a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d105c6f274366380374e805d8adf2ea780c3fcbf468ed3da0d30d931fdfebac8
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bebfedc8c1e6754606faa59b7c45e93ee3e4c7ec2913e3893c4695781b7892e7
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94b26738ea842513efa4609b1427f3c23b0d37d48ad0efdbd4bd1f4842bfb006
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f2fd9d1d3847bee68df39de96a06913e37dc3cacd6dcaa01e654f56e2f4eb49
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e68dcb3ef0e8b7c7d625ab213042156e023f2a245865d688db9dd4fe92f207dc
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:584d56bb430fe8df3c24eeab1822b6d753b2090cb92990956e81b8f8e3c6e416
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45b214c4a5ffd153814b8b271cfd3ef9fa8c2be4640619c8e4854dc05d7d1c97
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:283174a53dfb12f541f1223f29dfd78957ef99fb6b3f708ac21ff4aa6e7733c3
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a5e4dab35f0ee0b3a7f32c0e07341ebe72be8848c24cfaf9766b3fa46c7222
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4f93a55b6907505dff041e23ee75d98392142bc1e2e39401947dac1e4fb011a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b62411f66602767298f963adc60824709fbed6ee64305b82501b466a1c44c9e
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75ef6331ac4b2e8cf5bcc3f43391a2f41a0430eee842180c387f3d81fdad2fdc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8d986f9845c2c79697397af5a78c52fea0f5eb71ebc653676ec6d3ab1848325
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.13531799729364005,
5
  "eval_steps": 20,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1539,6 +1539,766 @@
1539
  "eval_samples_per_second": 5.74,
1540
  "eval_steps_per_second": 0.193,
1541
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1542
  }
1543
  ],
1544
  "logging_steps": 1,
@@ -1558,7 +2318,7 @@
1558
  "attributes": {}
1559
  }
1560
  },
1561
- "total_flos": 6.053734571520819e+16,
1562
  "train_batch_size": 8,
1563
  "trial_name": null,
1564
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2029769959404601,
5
  "eval_steps": 20,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1539
  "eval_samples_per_second": 5.74,
1540
  "eval_steps_per_second": 0.193,
1541
  "step": 200
1542
+ },
1543
+ {
1544
+ "epoch": 0.13599458728010824,
1545
+ "grad_norm": 2.476130247116089,
1546
+ "learning_rate": 1.9921738005215687e-05,
1547
+ "loss": 0.0617,
1548
+ "step": 201
1549
+ },
1550
+ {
1551
+ "epoch": 0.13667117726657646,
1552
+ "grad_norm": 7.735743045806885,
1553
+ "learning_rate": 1.9918760904111818e-05,
1554
+ "loss": 0.252,
1555
+ "step": 202
1556
+ },
1557
+ {
1558
+ "epoch": 0.13734776725304465,
1559
+ "grad_norm": 3.769490957260132,
1560
+ "learning_rate": 1.991572846115666e-05,
1561
+ "loss": 0.1439,
1562
+ "step": 203
1563
+ },
1564
+ {
1565
+ "epoch": 0.13802435723951287,
1566
+ "grad_norm": 4.381724834442139,
1567
+ "learning_rate": 1.9912640693269754e-05,
1568
+ "loss": 0.2143,
1569
+ "step": 204
1570
+ },
1571
+ {
1572
+ "epoch": 0.13870094722598106,
1573
+ "grad_norm": 3.511615037918091,
1574
+ "learning_rate": 1.990949761767935e-05,
1575
+ "loss": 0.1747,
1576
+ "step": 205
1577
+ },
1578
+ {
1579
+ "epoch": 0.13937753721244925,
1580
+ "grad_norm": 5.647243499755859,
1581
+ "learning_rate": 1.9906299251922273e-05,
1582
+ "loss": 0.1904,
1583
+ "step": 206
1584
+ },
1585
+ {
1586
+ "epoch": 0.14005412719891747,
1587
+ "grad_norm": 3.9160757064819336,
1588
+ "learning_rate": 1.9903045613843844e-05,
1589
+ "loss": 0.1816,
1590
+ "step": 207
1591
+ },
1592
+ {
1593
+ "epoch": 0.14073071718538566,
1594
+ "grad_norm": 5.033432483673096,
1595
+ "learning_rate": 1.9899736721597787e-05,
1596
+ "loss": 0.141,
1597
+ "step": 208
1598
+ },
1599
+ {
1600
+ "epoch": 0.14140730717185385,
1601
+ "grad_norm": 4.042255401611328,
1602
+ "learning_rate": 1.9896372593646095e-05,
1603
+ "loss": 0.2043,
1604
+ "step": 209
1605
+ },
1606
+ {
1607
+ "epoch": 0.14208389715832206,
1608
+ "grad_norm": 7.3899149894714355,
1609
+ "learning_rate": 1.989295324875897e-05,
1610
+ "loss": 0.2182,
1611
+ "step": 210
1612
+ },
1613
+ {
1614
+ "epoch": 0.14276048714479025,
1615
+ "grad_norm": 9.090564727783203,
1616
+ "learning_rate": 1.9889478706014687e-05,
1617
+ "loss": 0.2333,
1618
+ "step": 211
1619
+ },
1620
+ {
1621
+ "epoch": 0.14343707713125844,
1622
+ "grad_norm": 4.3934526443481445,
1623
+ "learning_rate": 1.9885948984799502e-05,
1624
+ "loss": 0.2836,
1625
+ "step": 212
1626
+ },
1627
+ {
1628
+ "epoch": 0.14411366711772666,
1629
+ "grad_norm": 3.89416766166687,
1630
+ "learning_rate": 1.9882364104807536e-05,
1631
+ "loss": 0.2256,
1632
+ "step": 213
1633
+ },
1634
+ {
1635
+ "epoch": 0.14479025710419485,
1636
+ "grad_norm": 6.6705241203308105,
1637
+ "learning_rate": 1.987872408604068e-05,
1638
+ "loss": 0.2007,
1639
+ "step": 214
1640
+ },
1641
+ {
1642
+ "epoch": 0.14546684709066307,
1643
+ "grad_norm": 9.201639175415039,
1644
+ "learning_rate": 1.9875028948808457e-05,
1645
+ "loss": 0.217,
1646
+ "step": 215
1647
+ },
1648
+ {
1649
+ "epoch": 0.14614343707713126,
1650
+ "grad_norm": 3.3696377277374268,
1651
+ "learning_rate": 1.9871278713727932e-05,
1652
+ "loss": 0.1132,
1653
+ "step": 216
1654
+ },
1655
+ {
1656
+ "epoch": 0.14682002706359945,
1657
+ "grad_norm": 3.9588944911956787,
1658
+ "learning_rate": 1.9867473401723595e-05,
1659
+ "loss": 0.2086,
1660
+ "step": 217
1661
+ },
1662
+ {
1663
+ "epoch": 0.14749661705006767,
1664
+ "grad_norm": 5.10556173324585,
1665
+ "learning_rate": 1.9863613034027224e-05,
1666
+ "loss": 0.1367,
1667
+ "step": 218
1668
+ },
1669
+ {
1670
+ "epoch": 0.14817320703653586,
1671
+ "grad_norm": 3.8839104175567627,
1672
+ "learning_rate": 1.9859697632177796e-05,
1673
+ "loss": 0.1882,
1674
+ "step": 219
1675
+ },
1676
+ {
1677
+ "epoch": 0.14884979702300405,
1678
+ "grad_norm": 3.7395753860473633,
1679
+ "learning_rate": 1.985572721802134e-05,
1680
+ "loss": 0.1228,
1681
+ "step": 220
1682
+ },
1683
+ {
1684
+ "epoch": 0.14884979702300405,
1685
+ "eval_accuracy": 0.763302752293578,
1686
+ "eval_f1": 0.3316062176165803,
1687
+ "eval_loss": 0.49911028146743774,
1688
+ "eval_precision": 0.7272727272727273,
1689
+ "eval_recall": 0.21476510067114093,
1690
+ "eval_runtime": 53.4724,
1691
+ "eval_samples_per_second": 5.573,
1692
+ "eval_steps_per_second": 0.187,
1693
+ "step": 220
1694
+ },
1695
+ {
1696
+ "epoch": 0.14952638700947227,
1697
+ "grad_norm": 2.52254056930542,
1698
+ "learning_rate": 1.9851701813710838e-05,
1699
+ "loss": 0.1429,
1700
+ "step": 221
1701
+ },
1702
+ {
1703
+ "epoch": 0.15020297699594046,
1704
+ "grad_norm": 2.212614059448242,
1705
+ "learning_rate": 1.9847621441706076e-05,
1706
+ "loss": 0.0924,
1707
+ "step": 222
1708
+ },
1709
+ {
1710
+ "epoch": 0.15087956698240865,
1711
+ "grad_norm": 5.361288547515869,
1712
+ "learning_rate": 1.9843486124773546e-05,
1713
+ "loss": 0.1915,
1714
+ "step": 223
1715
+ },
1716
+ {
1717
+ "epoch": 0.15155615696887687,
1718
+ "grad_norm": 5.06777286529541,
1719
+ "learning_rate": 1.98392958859863e-05,
1720
+ "loss": 0.2049,
1721
+ "step": 224
1722
+ },
1723
+ {
1724
+ "epoch": 0.15223274695534506,
1725
+ "grad_norm": 2.5112369060516357,
1726
+ "learning_rate": 1.9835050748723826e-05,
1727
+ "loss": 0.1132,
1728
+ "step": 225
1729
+ },
1730
+ {
1731
+ "epoch": 0.15290933694181327,
1732
+ "grad_norm": 3.829697847366333,
1733
+ "learning_rate": 1.9830750736671923e-05,
1734
+ "loss": 0.1766,
1735
+ "step": 226
1736
+ },
1737
+ {
1738
+ "epoch": 0.15358592692828146,
1739
+ "grad_norm": 6.517053604125977,
1740
+ "learning_rate": 1.982639587382256e-05,
1741
+ "loss": 0.2742,
1742
+ "step": 227
1743
+ },
1744
+ {
1745
+ "epoch": 0.15426251691474965,
1746
+ "grad_norm": 3.8287339210510254,
1747
+ "learning_rate": 1.9821986184473757e-05,
1748
+ "loss": 0.1686,
1749
+ "step": 228
1750
+ },
1751
+ {
1752
+ "epoch": 0.15493910690121787,
1753
+ "grad_norm": 3.6989524364471436,
1754
+ "learning_rate": 1.981752169322942e-05,
1755
+ "loss": 0.1286,
1756
+ "step": 229
1757
+ },
1758
+ {
1759
+ "epoch": 0.15561569688768606,
1760
+ "grad_norm": 4.2301788330078125,
1761
+ "learning_rate": 1.981300242499924e-05,
1762
+ "loss": 0.1242,
1763
+ "step": 230
1764
+ },
1765
+ {
1766
+ "epoch": 0.15629228687415425,
1767
+ "grad_norm": 5.749031066894531,
1768
+ "learning_rate": 1.9808428404998532e-05,
1769
+ "loss": 0.2348,
1770
+ "step": 231
1771
+ },
1772
+ {
1773
+ "epoch": 0.15696887686062247,
1774
+ "grad_norm": 4.1858744621276855,
1775
+ "learning_rate": 1.9803799658748096e-05,
1776
+ "loss": 0.1809,
1777
+ "step": 232
1778
+ },
1779
+ {
1780
+ "epoch": 0.15764546684709066,
1781
+ "grad_norm": 2.808894157409668,
1782
+ "learning_rate": 1.9799116212074077e-05,
1783
+ "loss": 0.1228,
1784
+ "step": 233
1785
+ },
1786
+ {
1787
+ "epoch": 0.15832205683355885,
1788
+ "grad_norm": 4.898924350738525,
1789
+ "learning_rate": 1.9794378091107834e-05,
1790
+ "loss": 0.1964,
1791
+ "step": 234
1792
+ },
1793
+ {
1794
+ "epoch": 0.15899864682002707,
1795
+ "grad_norm": 4.328680038452148,
1796
+ "learning_rate": 1.978958532228576e-05,
1797
+ "loss": 0.1566,
1798
+ "step": 235
1799
+ },
1800
+ {
1801
+ "epoch": 0.15967523680649526,
1802
+ "grad_norm": 4.020467758178711,
1803
+ "learning_rate": 1.978473793234918e-05,
1804
+ "loss": 0.2254,
1805
+ "step": 236
1806
+ },
1807
+ {
1808
+ "epoch": 0.16035182679296348,
1809
+ "grad_norm": 2.9529521465301514,
1810
+ "learning_rate": 1.977983594834416e-05,
1811
+ "loss": 0.1425,
1812
+ "step": 237
1813
+ },
1814
+ {
1815
+ "epoch": 0.16102841677943167,
1816
+ "grad_norm": 3.5832724571228027,
1817
+ "learning_rate": 1.9774879397621387e-05,
1818
+ "loss": 0.1848,
1819
+ "step": 238
1820
+ },
1821
+ {
1822
+ "epoch": 0.16170500676589986,
1823
+ "grad_norm": 6.061310768127441,
1824
+ "learning_rate": 1.9769868307835996e-05,
1825
+ "loss": 0.1344,
1826
+ "step": 239
1827
+ },
1828
+ {
1829
+ "epoch": 0.16238159675236807,
1830
+ "grad_norm": 4.559755325317383,
1831
+ "learning_rate": 1.9764802706947423e-05,
1832
+ "loss": 0.1678,
1833
+ "step": 240
1834
+ },
1835
+ {
1836
+ "epoch": 0.16238159675236807,
1837
+ "eval_accuracy": 0.763302752293578,
1838
+ "eval_f1": 0.32105263157894737,
1839
+ "eval_loss": 0.5306172966957092,
1840
+ "eval_precision": 0.7439024390243902,
1841
+ "eval_recall": 0.20469798657718122,
1842
+ "eval_runtime": 52.6664,
1843
+ "eval_samples_per_second": 5.658,
1844
+ "eval_steps_per_second": 0.19,
1845
+ "step": 240
1846
+ },
1847
+ {
1848
+ "epoch": 0.16305818673883626,
1849
+ "grad_norm": 4.202253818511963,
1850
+ "learning_rate": 1.975968262321925e-05,
1851
+ "loss": 0.1828,
1852
+ "step": 241
1853
+ },
1854
+ {
1855
+ "epoch": 0.16373477672530445,
1856
+ "grad_norm": 4.026851654052734,
1857
+ "learning_rate": 1.9754508085219057e-05,
1858
+ "loss": 0.2173,
1859
+ "step": 242
1860
+ },
1861
+ {
1862
+ "epoch": 0.16441136671177267,
1863
+ "grad_norm": 3.136077404022217,
1864
+ "learning_rate": 1.9749279121818235e-05,
1865
+ "loss": 0.1614,
1866
+ "step": 243
1867
+ },
1868
+ {
1869
+ "epoch": 0.16508795669824086,
1870
+ "grad_norm": 3.726810932159424,
1871
+ "learning_rate": 1.974399576219186e-05,
1872
+ "loss": 0.1667,
1873
+ "step": 244
1874
+ },
1875
+ {
1876
+ "epoch": 0.16576454668470908,
1877
+ "grad_norm": 5.430721282958984,
1878
+ "learning_rate": 1.9738658035818495e-05,
1879
+ "loss": 0.2158,
1880
+ "step": 245
1881
+ },
1882
+ {
1883
+ "epoch": 0.16644113667117727,
1884
+ "grad_norm": 2.940300226211548,
1885
+ "learning_rate": 1.973326597248006e-05,
1886
+ "loss": 0.1664,
1887
+ "step": 246
1888
+ },
1889
+ {
1890
+ "epoch": 0.16711772665764546,
1891
+ "grad_norm": 3.436058282852173,
1892
+ "learning_rate": 1.972781960226163e-05,
1893
+ "loss": 0.1944,
1894
+ "step": 247
1895
+ },
1896
+ {
1897
+ "epoch": 0.16779431664411368,
1898
+ "grad_norm": 3.708514928817749,
1899
+ "learning_rate": 1.9722318955551307e-05,
1900
+ "loss": 0.1869,
1901
+ "step": 248
1902
+ },
1903
+ {
1904
+ "epoch": 0.16847090663058187,
1905
+ "grad_norm": 5.172970771789551,
1906
+ "learning_rate": 1.971676406304001e-05,
1907
+ "loss": 0.219,
1908
+ "step": 249
1909
+ },
1910
+ {
1911
+ "epoch": 0.16914749661705006,
1912
+ "grad_norm": 4.198179721832275,
1913
+ "learning_rate": 1.9711154955721338e-05,
1914
+ "loss": 0.1606,
1915
+ "step": 250
1916
+ },
1917
+ {
1918
+ "epoch": 0.16982408660351828,
1919
+ "grad_norm": 4.068305969238281,
1920
+ "learning_rate": 1.9705491664891368e-05,
1921
+ "loss": 0.1952,
1922
+ "step": 251
1923
+ },
1924
+ {
1925
+ "epoch": 0.17050067658998647,
1926
+ "grad_norm": 4.898702144622803,
1927
+ "learning_rate": 1.969977422214851e-05,
1928
+ "loss": 0.1904,
1929
+ "step": 252
1930
+ },
1931
+ {
1932
+ "epoch": 0.17117726657645466,
1933
+ "grad_norm": 3.666128158569336,
1934
+ "learning_rate": 1.9694002659393306e-05,
1935
+ "loss": 0.177,
1936
+ "step": 253
1937
+ },
1938
+ {
1939
+ "epoch": 0.17185385656292287,
1940
+ "grad_norm": 3.7936861515045166,
1941
+ "learning_rate": 1.968817700882826e-05,
1942
+ "loss": 0.1003,
1943
+ "step": 254
1944
+ },
1945
+ {
1946
+ "epoch": 0.17253044654939106,
1947
+ "grad_norm": 3.3366808891296387,
1948
+ "learning_rate": 1.9682297302957666e-05,
1949
+ "loss": 0.1729,
1950
+ "step": 255
1951
+ },
1952
+ {
1953
+ "epoch": 0.17320703653585928,
1954
+ "grad_norm": 4.625013828277588,
1955
+ "learning_rate": 1.9676363574587414e-05,
1956
+ "loss": 0.2212,
1957
+ "step": 256
1958
+ },
1959
+ {
1960
+ "epoch": 0.17388362652232747,
1961
+ "grad_norm": 4.048298358917236,
1962
+ "learning_rate": 1.9670375856824823e-05,
1963
+ "loss": 0.1301,
1964
+ "step": 257
1965
+ },
1966
+ {
1967
+ "epoch": 0.17456021650879566,
1968
+ "grad_norm": 3.388268232345581,
1969
+ "learning_rate": 1.966433418307843e-05,
1970
+ "loss": 0.091,
1971
+ "step": 258
1972
+ },
1973
+ {
1974
+ "epoch": 0.17523680649526388,
1975
+ "grad_norm": 3.287910223007202,
1976
+ "learning_rate": 1.9658238587057832e-05,
1977
+ "loss": 0.1748,
1978
+ "step": 259
1979
+ },
1980
+ {
1981
+ "epoch": 0.17591339648173207,
1982
+ "grad_norm": 4.814307689666748,
1983
+ "learning_rate": 1.9652089102773487e-05,
1984
+ "loss": 0.2111,
1985
+ "step": 260
1986
+ },
1987
+ {
1988
+ "epoch": 0.17591339648173207,
1989
+ "eval_accuracy": 0.763302752293578,
1990
+ "eval_f1": 0.29120879120879123,
1991
+ "eval_loss": 0.5758374333381653,
1992
+ "eval_precision": 0.803030303030303,
1993
+ "eval_recall": 0.17785234899328858,
1994
+ "eval_runtime": 52.5901,
1995
+ "eval_samples_per_second": 5.666,
1996
+ "eval_steps_per_second": 0.19,
1997
+ "step": 260
1998
+ },
1999
+ {
2000
+ "epoch": 0.17658998646820026,
2001
+ "grad_norm": 6.4831461906433105,
2002
+ "learning_rate": 1.9645885764536522e-05,
2003
+ "loss": 0.2182,
2004
+ "step": 261
2005
+ },
2006
+ {
2007
+ "epoch": 0.17726657645466848,
2008
+ "grad_norm": 6.477516174316406,
2009
+ "learning_rate": 1.9639628606958535e-05,
2010
+ "loss": 0.2462,
2011
+ "step": 262
2012
+ },
2013
+ {
2014
+ "epoch": 0.17794316644113667,
2015
+ "grad_norm": 3.73384690284729,
2016
+ "learning_rate": 1.9633317664951418e-05,
2017
+ "loss": 0.1425,
2018
+ "step": 263
2019
+ },
2020
+ {
2021
+ "epoch": 0.17861975642760486,
2022
+ "grad_norm": 4.063915252685547,
2023
+ "learning_rate": 1.962695297372715e-05,
2024
+ "loss": 0.1388,
2025
+ "step": 264
2026
+ },
2027
+ {
2028
+ "epoch": 0.17929634641407308,
2029
+ "grad_norm": 4.379425048828125,
2030
+ "learning_rate": 1.962053456879761e-05,
2031
+ "loss": 0.1929,
2032
+ "step": 265
2033
+ },
2034
+ {
2035
+ "epoch": 0.17997293640054127,
2036
+ "grad_norm": 3.696601629257202,
2037
+ "learning_rate": 1.9614062485974364e-05,
2038
+ "loss": 0.1795,
2039
+ "step": 266
2040
+ },
2041
+ {
2042
+ "epoch": 0.18064952638700948,
2043
+ "grad_norm": 4.814270973205566,
2044
+ "learning_rate": 1.9607536761368484e-05,
2045
+ "loss": 0.1906,
2046
+ "step": 267
2047
+ },
2048
+ {
2049
+ "epoch": 0.18132611637347767,
2050
+ "grad_norm": 4.517858028411865,
2051
+ "learning_rate": 1.960095743139033e-05,
2052
+ "loss": 0.1902,
2053
+ "step": 268
2054
+ },
2055
+ {
2056
+ "epoch": 0.18200270635994586,
2057
+ "grad_norm": 4.473437309265137,
2058
+ "learning_rate": 1.9594324532749353e-05,
2059
+ "loss": 0.1581,
2060
+ "step": 269
2061
+ },
2062
+ {
2063
+ "epoch": 0.18267929634641408,
2064
+ "grad_norm": 6.359562873840332,
2065
+ "learning_rate": 1.95876381024539e-05,
2066
+ "loss": 0.2641,
2067
+ "step": 270
2068
+ },
2069
+ {
2070
+ "epoch": 0.18335588633288227,
2071
+ "grad_norm": 5.020825386047363,
2072
+ "learning_rate": 1.958089817781099e-05,
2073
+ "loss": 0.1734,
2074
+ "step": 271
2075
+ },
2076
+ {
2077
+ "epoch": 0.18403247631935046,
2078
+ "grad_norm": 5.725416660308838,
2079
+ "learning_rate": 1.9574104796426124e-05,
2080
+ "loss": 0.2771,
2081
+ "step": 272
2082
+ },
2083
+ {
2084
+ "epoch": 0.18470906630581868,
2085
+ "grad_norm": 3.2466204166412354,
2086
+ "learning_rate": 1.956725799620305e-05,
2087
+ "loss": 0.1493,
2088
+ "step": 273
2089
+ },
2090
+ {
2091
+ "epoch": 0.18538565629228687,
2092
+ "grad_norm": 4.315434455871582,
2093
+ "learning_rate": 1.9560357815343577e-05,
2094
+ "loss": 0.1879,
2095
+ "step": 274
2096
+ },
2097
+ {
2098
+ "epoch": 0.18606224627875506,
2099
+ "grad_norm": 4.0321245193481445,
2100
+ "learning_rate": 1.9553404292347356e-05,
2101
+ "loss": 0.1276,
2102
+ "step": 275
2103
+ },
2104
+ {
2105
+ "epoch": 0.18673883626522328,
2106
+ "grad_norm": 3.7112905979156494,
2107
+ "learning_rate": 1.9546397466011654e-05,
2108
+ "loss": 0.173,
2109
+ "step": 276
2110
+ },
2111
+ {
2112
+ "epoch": 0.18741542625169147,
2113
+ "grad_norm": 5.85778284072876,
2114
+ "learning_rate": 1.9539337375431144e-05,
2115
+ "loss": 0.3178,
2116
+ "step": 277
2117
+ },
2118
+ {
2119
+ "epoch": 0.1880920162381597,
2120
+ "grad_norm": 5.455870151519775,
2121
+ "learning_rate": 1.9532224059997693e-05,
2122
+ "loss": 0.1971,
2123
+ "step": 278
2124
+ },
2125
+ {
2126
+ "epoch": 0.18876860622462788,
2127
+ "grad_norm": 3.5191891193389893,
2128
+ "learning_rate": 1.9525057559400134e-05,
2129
+ "loss": 0.165,
2130
+ "step": 279
2131
+ },
2132
+ {
2133
+ "epoch": 0.18944519621109607,
2134
+ "grad_norm": 4.080350399017334,
2135
+ "learning_rate": 1.9517837913624048e-05,
2136
+ "loss": 0.115,
2137
+ "step": 280
2138
+ },
2139
+ {
2140
+ "epoch": 0.18944519621109607,
2141
+ "eval_accuracy": 0.7706422018348624,
2142
+ "eval_f1": 0.3315508021390374,
2143
+ "eval_loss": 0.49153777956962585,
2144
+ "eval_precision": 0.8157894736842105,
2145
+ "eval_recall": 0.2080536912751678,
2146
+ "eval_runtime": 53.3463,
2147
+ "eval_samples_per_second": 5.586,
2148
+ "eval_steps_per_second": 0.187,
2149
+ "step": 280
2150
+ },
2151
+ {
2152
+ "epoch": 0.19012178619756429,
2153
+ "grad_norm": 3.4508748054504395,
2154
+ "learning_rate": 1.9510565162951538e-05,
2155
+ "loss": 0.1461,
2156
+ "step": 281
2157
+ },
2158
+ {
2159
+ "epoch": 0.19079837618403248,
2160
+ "grad_norm": 5.258754253387451,
2161
+ "learning_rate": 1.9503239347961006e-05,
2162
+ "loss": 0.2396,
2163
+ "step": 282
2164
+ },
2165
+ {
2166
+ "epoch": 0.19147496617050067,
2167
+ "grad_norm": 5.140385627746582,
2168
+ "learning_rate": 1.9495860509526935e-05,
2169
+ "loss": 0.1444,
2170
+ "step": 283
2171
+ },
2172
+ {
2173
+ "epoch": 0.19215155615696888,
2174
+ "grad_norm": 1.9280897378921509,
2175
+ "learning_rate": 1.948842868881964e-05,
2176
+ "loss": 0.1426,
2177
+ "step": 284
2178
+ },
2179
+ {
2180
+ "epoch": 0.19282814614343707,
2181
+ "grad_norm": 2.6845431327819824,
2182
+ "learning_rate": 1.948094392730506e-05,
2183
+ "loss": 0.1702,
2184
+ "step": 285
2185
+ },
2186
+ {
2187
+ "epoch": 0.19350473612990526,
2188
+ "grad_norm": 2.7291038036346436,
2189
+ "learning_rate": 1.9473406266744518e-05,
2190
+ "loss": 0.1525,
2191
+ "step": 286
2192
+ },
2193
+ {
2194
+ "epoch": 0.19418132611637348,
2195
+ "grad_norm": 2.8851161003112793,
2196
+ "learning_rate": 1.9465815749194482e-05,
2197
+ "loss": 0.1419,
2198
+ "step": 287
2199
+ },
2200
+ {
2201
+ "epoch": 0.19485791610284167,
2202
+ "grad_norm": 3.973231554031372,
2203
+ "learning_rate": 1.9458172417006347e-05,
2204
+ "loss": 0.1782,
2205
+ "step": 288
2206
+ },
2207
+ {
2208
+ "epoch": 0.1955345060893099,
2209
+ "grad_norm": 5.708676338195801,
2210
+ "learning_rate": 1.9450476312826178e-05,
2211
+ "loss": 0.1396,
2212
+ "step": 289
2213
+ },
2214
+ {
2215
+ "epoch": 0.19621109607577808,
2216
+ "grad_norm": 3.4198830127716064,
2217
+ "learning_rate": 1.9442727479594486e-05,
2218
+ "loss": 0.1762,
2219
+ "step": 290
2220
+ },
2221
+ {
2222
+ "epoch": 0.19688768606224627,
2223
+ "grad_norm": 5.898075103759766,
2224
+ "learning_rate": 1.9434925960545978e-05,
2225
+ "loss": 0.213,
2226
+ "step": 291
2227
+ },
2228
+ {
2229
+ "epoch": 0.1975642760487145,
2230
+ "grad_norm": 5.121380805969238,
2231
+ "learning_rate": 1.9427071799209335e-05,
2232
+ "loss": 0.2684,
2233
+ "step": 292
2234
+ },
2235
+ {
2236
+ "epoch": 0.19824086603518268,
2237
+ "grad_norm": 5.1736931800842285,
2238
+ "learning_rate": 1.941916503940694e-05,
2239
+ "loss": 0.2272,
2240
+ "step": 293
2241
+ },
2242
+ {
2243
+ "epoch": 0.19891745602165087,
2244
+ "grad_norm": 3.988576650619507,
2245
+ "learning_rate": 1.941120572525467e-05,
2246
+ "loss": 0.2007,
2247
+ "step": 294
2248
+ },
2249
+ {
2250
+ "epoch": 0.19959404600811909,
2251
+ "grad_norm": 6.444464683532715,
2252
+ "learning_rate": 1.9403193901161614e-05,
2253
+ "loss": 0.2243,
2254
+ "step": 295
2255
+ },
2256
+ {
2257
+ "epoch": 0.20027063599458728,
2258
+ "grad_norm": 3.4448323249816895,
2259
+ "learning_rate": 1.9395129611829844e-05,
2260
+ "loss": 0.1175,
2261
+ "step": 296
2262
+ },
2263
+ {
2264
+ "epoch": 0.2009472259810555,
2265
+ "grad_norm": 7.464962005615234,
2266
+ "learning_rate": 1.9387012902254165e-05,
2267
+ "loss": 0.2362,
2268
+ "step": 297
2269
+ },
2270
+ {
2271
+ "epoch": 0.20162381596752368,
2272
+ "grad_norm": 4.778014183044434,
2273
+ "learning_rate": 1.9378843817721856e-05,
2274
+ "loss": 0.1657,
2275
+ "step": 298
2276
+ },
2277
+ {
2278
+ "epoch": 0.20230040595399187,
2279
+ "grad_norm": 4.121883392333984,
2280
+ "learning_rate": 1.937062240381243e-05,
2281
+ "loss": 0.1339,
2282
+ "step": 299
2283
+ },
2284
+ {
2285
+ "epoch": 0.2029769959404601,
2286
+ "grad_norm": 5.5182576179504395,
2287
+ "learning_rate": 1.9362348706397374e-05,
2288
+ "loss": 0.1785,
2289
+ "step": 300
2290
+ },
2291
+ {
2292
+ "epoch": 0.2029769959404601,
2293
+ "eval_accuracy": 0.773394495412844,
2294
+ "eval_f1": 0.35170603674540685,
2295
+ "eval_loss": 0.5283112525939941,
2296
+ "eval_precision": 0.8072289156626506,
2297
+ "eval_recall": 0.22483221476510068,
2298
+ "eval_runtime": 51.5451,
2299
+ "eval_samples_per_second": 5.781,
2300
+ "eval_steps_per_second": 0.194,
2301
+ "step": 300
2302
  }
2303
  ],
2304
  "logging_steps": 1,
 
2318
  "attributes": {}
2319
  }
2320
  },
2321
+ "total_flos": 9.054635921978163e+16,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null