kiritan commited on
Commit
e858f64
·
verified ·
1 Parent(s): 26e7f22

Training in progress, step 7000, checkpoint

Browse files
last-checkpoint/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b40c29506ba51102a5e99f143966af19218dad4afdf7f5a54e3c91a9ba8aaa6
3
+ size 5117197489
last-checkpoint/global_step7000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8d472475f1ab585d49c8a44e0b9d581931a511d04b94f5f0d50d7c89a6e10a7
3
+ size 859127933
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step5000
 
1
+ global_step7000
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e917f6578a37f477ce51d824ef2c22355d57ba680883e19ec30d3b97940c7e3b
3
  size 962205216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:856733172381a37b6de12c25512bfa5cc33814241a1986b18ae46a3c6cd69ce1
3
  size 962205216
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe58c8283b537c6ee9a4dd56ebbea21d90b446075eea802c036a3707078dd25c
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56a8ea7c15005d31ade663058f08a1d5a4619da6c77df5179c75f15bb9cc3f05
3
  size 14709
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fdaaa58d591c2d03b0ec95bb2576cb7c7885945b5e85c3aae63ede0ea16cfc5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34a83060f11df4fe46a27d45e8744a4c0e7bb60df156e5d496780133906eacd7
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 86.11975483262611,
3
- "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-5000",
4
- "epoch": 5.506607929515418,
5
  "eval_steps": 1000,
6
- "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1457,6 +1457,586 @@
1457
  "eval_steps_per_second": 1.999,
1458
  "eval_wer": 86.11975483262611,
1459
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1460
  }
1461
  ],
1462
  "logging_steps": 25,
@@ -1476,7 +2056,7 @@
1476
  "attributes": {}
1477
  }
1478
  },
1479
- "total_flos": 8.590715932450488e+19,
1480
  "train_batch_size": 4,
1481
  "trial_name": null,
1482
  "trial_params": null
 
1
  {
2
+ "best_metric": 84.13012729844414,
3
+ "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-7000",
4
+ "epoch": 7.709251101321586,
5
  "eval_steps": 1000,
6
+ "global_step": 7000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1457
  "eval_steps_per_second": 1.999,
1458
  "eval_wer": 86.11975483262611,
1459
  "step": 5000
1460
+ },
1461
+ {
1462
+ "epoch": 5.534140969162996,
1463
+ "grad_norm": 0.6520294547080994,
1464
+ "learning_rate": 1.535897435897436e-05,
1465
+ "loss": 0.1639,
1466
+ "step": 5025
1467
+ },
1468
+ {
1469
+ "epoch": 5.561674008810573,
1470
+ "grad_norm": 0.6221819519996643,
1471
+ "learning_rate": 1.5333333333333334e-05,
1472
+ "loss": 0.1851,
1473
+ "step": 5050
1474
+ },
1475
+ {
1476
+ "epoch": 5.5892070484581495,
1477
+ "grad_norm": 0.5925490856170654,
1478
+ "learning_rate": 1.5307692307692308e-05,
1479
+ "loss": 0.1733,
1480
+ "step": 5075
1481
+ },
1482
+ {
1483
+ "epoch": 5.616740088105727,
1484
+ "grad_norm": 1.0411094427108765,
1485
+ "learning_rate": 1.5282051282051282e-05,
1486
+ "loss": 0.1502,
1487
+ "step": 5100
1488
+ },
1489
+ {
1490
+ "epoch": 5.644273127753304,
1491
+ "grad_norm": 0.8545799255371094,
1492
+ "learning_rate": 1.5256410256410257e-05,
1493
+ "loss": 0.1681,
1494
+ "step": 5125
1495
+ },
1496
+ {
1497
+ "epoch": 5.671806167400881,
1498
+ "grad_norm": 0.5849359631538391,
1499
+ "learning_rate": 1.523076923076923e-05,
1500
+ "loss": 0.1778,
1501
+ "step": 5150
1502
+ },
1503
+ {
1504
+ "epoch": 5.6993392070484585,
1505
+ "grad_norm": 0.5703755617141724,
1506
+ "learning_rate": 1.5205128205128206e-05,
1507
+ "loss": 0.1328,
1508
+ "step": 5175
1509
+ },
1510
+ {
1511
+ "epoch": 5.726872246696035,
1512
+ "grad_norm": 0.7638615965843201,
1513
+ "learning_rate": 1.517948717948718e-05,
1514
+ "loss": 0.1929,
1515
+ "step": 5200
1516
+ },
1517
+ {
1518
+ "epoch": 5.754405286343612,
1519
+ "grad_norm": 0.5087124109268188,
1520
+ "learning_rate": 1.5153846153846155e-05,
1521
+ "loss": 0.1545,
1522
+ "step": 5225
1523
+ },
1524
+ {
1525
+ "epoch": 5.78193832599119,
1526
+ "grad_norm": 0.8324174284934998,
1527
+ "learning_rate": 1.5128205128205129e-05,
1528
+ "loss": 0.1975,
1529
+ "step": 5250
1530
+ },
1531
+ {
1532
+ "epoch": 5.809471365638767,
1533
+ "grad_norm": 0.4413852095603943,
1534
+ "learning_rate": 1.5102564102564104e-05,
1535
+ "loss": 0.1663,
1536
+ "step": 5275
1537
+ },
1538
+ {
1539
+ "epoch": 5.8370044052863435,
1540
+ "grad_norm": 0.708247184753418,
1541
+ "learning_rate": 1.5076923076923078e-05,
1542
+ "loss": 0.1543,
1543
+ "step": 5300
1544
+ },
1545
+ {
1546
+ "epoch": 5.864537444933921,
1547
+ "grad_norm": 0.689794659614563,
1548
+ "learning_rate": 1.5051282051282053e-05,
1549
+ "loss": 0.1621,
1550
+ "step": 5325
1551
+ },
1552
+ {
1553
+ "epoch": 5.892070484581498,
1554
+ "grad_norm": 0.5020695328712463,
1555
+ "learning_rate": 1.5025641025641027e-05,
1556
+ "loss": 0.1774,
1557
+ "step": 5350
1558
+ },
1559
+ {
1560
+ "epoch": 5.919603524229075,
1561
+ "grad_norm": 0.4771401882171631,
1562
+ "learning_rate": 1.5000000000000002e-05,
1563
+ "loss": 0.1442,
1564
+ "step": 5375
1565
+ },
1566
+ {
1567
+ "epoch": 5.9471365638766525,
1568
+ "grad_norm": 1.0237714052200317,
1569
+ "learning_rate": 1.4974358974358976e-05,
1570
+ "loss": 0.1629,
1571
+ "step": 5400
1572
+ },
1573
+ {
1574
+ "epoch": 5.974669603524229,
1575
+ "grad_norm": 0.7134143114089966,
1576
+ "learning_rate": 1.494871794871795e-05,
1577
+ "loss": 0.144,
1578
+ "step": 5425
1579
+ },
1580
+ {
1581
+ "epoch": 6.002202643171806,
1582
+ "grad_norm": 0.30046069622039795,
1583
+ "learning_rate": 1.4923076923076925e-05,
1584
+ "loss": 0.1373,
1585
+ "step": 5450
1586
+ },
1587
+ {
1588
+ "epoch": 6.029735682819383,
1589
+ "grad_norm": 0.4057641923427582,
1590
+ "learning_rate": 1.4897435897435898e-05,
1591
+ "loss": 0.1082,
1592
+ "step": 5475
1593
+ },
1594
+ {
1595
+ "epoch": 6.057268722466961,
1596
+ "grad_norm": 0.486996591091156,
1597
+ "learning_rate": 1.4871794871794874e-05,
1598
+ "loss": 0.0848,
1599
+ "step": 5500
1600
+ },
1601
+ {
1602
+ "epoch": 6.084801762114537,
1603
+ "grad_norm": 0.46409764885902405,
1604
+ "learning_rate": 1.4846153846153847e-05,
1605
+ "loss": 0.1075,
1606
+ "step": 5525
1607
+ },
1608
+ {
1609
+ "epoch": 6.112334801762114,
1610
+ "grad_norm": 0.4308403432369232,
1611
+ "learning_rate": 1.4820512820512823e-05,
1612
+ "loss": 0.1193,
1613
+ "step": 5550
1614
+ },
1615
+ {
1616
+ "epoch": 6.139867841409692,
1617
+ "grad_norm": 0.34751376509666443,
1618
+ "learning_rate": 1.4794871794871796e-05,
1619
+ "loss": 0.1139,
1620
+ "step": 5575
1621
+ },
1622
+ {
1623
+ "epoch": 6.167400881057269,
1624
+ "grad_norm": 0.8365034461021423,
1625
+ "learning_rate": 1.4769230769230772e-05,
1626
+ "loss": 0.1273,
1627
+ "step": 5600
1628
+ },
1629
+ {
1630
+ "epoch": 6.1949339207048455,
1631
+ "grad_norm": 0.34338051080703735,
1632
+ "learning_rate": 1.4743589743589745e-05,
1633
+ "loss": 0.0895,
1634
+ "step": 5625
1635
+ },
1636
+ {
1637
+ "epoch": 6.222466960352423,
1638
+ "grad_norm": 0.6777989864349365,
1639
+ "learning_rate": 1.471794871794872e-05,
1640
+ "loss": 0.121,
1641
+ "step": 5650
1642
+ },
1643
+ {
1644
+ "epoch": 6.25,
1645
+ "grad_norm": 0.5982616543769836,
1646
+ "learning_rate": 1.4692307692307694e-05,
1647
+ "loss": 0.1214,
1648
+ "step": 5675
1649
+ },
1650
+ {
1651
+ "epoch": 6.277533039647577,
1652
+ "grad_norm": 0.5918659567832947,
1653
+ "learning_rate": 1.4666666666666666e-05,
1654
+ "loss": 0.0863,
1655
+ "step": 5700
1656
+ },
1657
+ {
1658
+ "epoch": 6.3050660792951545,
1659
+ "grad_norm": 0.35085636377334595,
1660
+ "learning_rate": 1.4641025641025642e-05,
1661
+ "loss": 0.11,
1662
+ "step": 5725
1663
+ },
1664
+ {
1665
+ "epoch": 6.332599118942731,
1666
+ "grad_norm": 0.43525975942611694,
1667
+ "learning_rate": 1.4615384615384615e-05,
1668
+ "loss": 0.0945,
1669
+ "step": 5750
1670
+ },
1671
+ {
1672
+ "epoch": 6.360132158590308,
1673
+ "grad_norm": 0.3799566328525543,
1674
+ "learning_rate": 1.458974358974359e-05,
1675
+ "loss": 0.0984,
1676
+ "step": 5775
1677
+ },
1678
+ {
1679
+ "epoch": 6.387665198237886,
1680
+ "grad_norm": 0.7915482521057129,
1681
+ "learning_rate": 1.4564102564102564e-05,
1682
+ "loss": 0.1154,
1683
+ "step": 5800
1684
+ },
1685
+ {
1686
+ "epoch": 6.415198237885463,
1687
+ "grad_norm": 0.47404220700263977,
1688
+ "learning_rate": 1.453846153846154e-05,
1689
+ "loss": 0.0984,
1690
+ "step": 5825
1691
+ },
1692
+ {
1693
+ "epoch": 6.442731277533039,
1694
+ "grad_norm": 0.3866804838180542,
1695
+ "learning_rate": 1.4512820512820513e-05,
1696
+ "loss": 0.1273,
1697
+ "step": 5850
1698
+ },
1699
+ {
1700
+ "epoch": 6.470264317180617,
1701
+ "grad_norm": 0.4380825459957123,
1702
+ "learning_rate": 1.4487179487179489e-05,
1703
+ "loss": 0.1332,
1704
+ "step": 5875
1705
+ },
1706
+ {
1707
+ "epoch": 6.497797356828194,
1708
+ "grad_norm": 0.4749581515789032,
1709
+ "learning_rate": 1.4461538461538462e-05,
1710
+ "loss": 0.1243,
1711
+ "step": 5900
1712
+ },
1713
+ {
1714
+ "epoch": 6.525330396475771,
1715
+ "grad_norm": 0.820015549659729,
1716
+ "learning_rate": 1.4435897435897438e-05,
1717
+ "loss": 0.1305,
1718
+ "step": 5925
1719
+ },
1720
+ {
1721
+ "epoch": 6.5528634361233475,
1722
+ "grad_norm": 0.5644270181655884,
1723
+ "learning_rate": 1.4410256410256411e-05,
1724
+ "loss": 0.1167,
1725
+ "step": 5950
1726
+ },
1727
+ {
1728
+ "epoch": 6.580396475770925,
1729
+ "grad_norm": 0.4454534649848938,
1730
+ "learning_rate": 1.4384615384615387e-05,
1731
+ "loss": 0.1101,
1732
+ "step": 5975
1733
+ },
1734
+ {
1735
+ "epoch": 6.607929515418502,
1736
+ "grad_norm": 0.5850095748901367,
1737
+ "learning_rate": 1.435897435897436e-05,
1738
+ "loss": 0.104,
1739
+ "step": 6000
1740
+ },
1741
+ {
1742
+ "epoch": 6.607929515418502,
1743
+ "eval_cer": 24.67816078777527,
1744
+ "eval_loss": 0.7848840355873108,
1745
+ "eval_runtime": 1300.262,
1746
+ "eval_samples_per_second": 8.138,
1747
+ "eval_steps_per_second": 2.035,
1748
+ "eval_wer": 84.6016030174446,
1749
+ "step": 6000
1750
+ },
1751
+ {
1752
+ "epoch": 6.635462555066079,
1753
+ "grad_norm": 0.6678868532180786,
1754
+ "learning_rate": 1.4333333333333334e-05,
1755
+ "loss": 0.0875,
1756
+ "step": 6025
1757
+ },
1758
+ {
1759
+ "epoch": 6.6629955947136565,
1760
+ "grad_norm": 0.279801607131958,
1761
+ "learning_rate": 1.430769230769231e-05,
1762
+ "loss": 0.1333,
1763
+ "step": 6050
1764
+ },
1765
+ {
1766
+ "epoch": 6.690528634361233,
1767
+ "grad_norm": 0.8138979077339172,
1768
+ "learning_rate": 1.4282051282051283e-05,
1769
+ "loss": 0.1458,
1770
+ "step": 6075
1771
+ },
1772
+ {
1773
+ "epoch": 6.71806167400881,
1774
+ "grad_norm": 0.6547926068305969,
1775
+ "learning_rate": 1.4256410256410258e-05,
1776
+ "loss": 0.1421,
1777
+ "step": 6100
1778
+ },
1779
+ {
1780
+ "epoch": 6.745594713656388,
1781
+ "grad_norm": 0.6097781658172607,
1782
+ "learning_rate": 1.4230769230769232e-05,
1783
+ "loss": 0.1285,
1784
+ "step": 6125
1785
+ },
1786
+ {
1787
+ "epoch": 6.773127753303965,
1788
+ "grad_norm": 0.4184475839138031,
1789
+ "learning_rate": 1.4205128205128207e-05,
1790
+ "loss": 0.1139,
1791
+ "step": 6150
1792
+ },
1793
+ {
1794
+ "epoch": 6.8006607929515415,
1795
+ "grad_norm": 0.4548538029193878,
1796
+ "learning_rate": 1.4179487179487181e-05,
1797
+ "loss": 0.1214,
1798
+ "step": 6175
1799
+ },
1800
+ {
1801
+ "epoch": 6.828193832599119,
1802
+ "grad_norm": 0.3974076509475708,
1803
+ "learning_rate": 1.4153846153846156e-05,
1804
+ "loss": 0.1051,
1805
+ "step": 6200
1806
+ },
1807
+ {
1808
+ "epoch": 6.855726872246696,
1809
+ "grad_norm": 0.7746002078056335,
1810
+ "learning_rate": 1.412820512820513e-05,
1811
+ "loss": 0.1349,
1812
+ "step": 6225
1813
+ },
1814
+ {
1815
+ "epoch": 6.883259911894273,
1816
+ "grad_norm": 0.3809688687324524,
1817
+ "learning_rate": 1.4102564102564105e-05,
1818
+ "loss": 0.11,
1819
+ "step": 6250
1820
+ },
1821
+ {
1822
+ "epoch": 6.9107929515418505,
1823
+ "grad_norm": 0.39594364166259766,
1824
+ "learning_rate": 1.4076923076923079e-05,
1825
+ "loss": 0.1006,
1826
+ "step": 6275
1827
+ },
1828
+ {
1829
+ "epoch": 6.938325991189427,
1830
+ "grad_norm": 0.5483039617538452,
1831
+ "learning_rate": 1.405128205128205e-05,
1832
+ "loss": 0.1122,
1833
+ "step": 6300
1834
+ },
1835
+ {
1836
+ "epoch": 6.965859030837004,
1837
+ "grad_norm": 0.6642032861709595,
1838
+ "learning_rate": 1.4025641025641026e-05,
1839
+ "loss": 0.1232,
1840
+ "step": 6325
1841
+ },
1842
+ {
1843
+ "epoch": 6.993392070484582,
1844
+ "grad_norm": 0.42328912019729614,
1845
+ "learning_rate": 1.4e-05,
1846
+ "loss": 0.1125,
1847
+ "step": 6350
1848
+ },
1849
+ {
1850
+ "epoch": 7.020925110132159,
1851
+ "grad_norm": 0.6559634804725647,
1852
+ "learning_rate": 1.3974358974358975e-05,
1853
+ "loss": 0.0831,
1854
+ "step": 6375
1855
+ },
1856
+ {
1857
+ "epoch": 7.048458149779735,
1858
+ "grad_norm": 0.5906537175178528,
1859
+ "learning_rate": 1.3948717948717949e-05,
1860
+ "loss": 0.0752,
1861
+ "step": 6400
1862
+ },
1863
+ {
1864
+ "epoch": 7.075991189427313,
1865
+ "grad_norm": 0.5048888921737671,
1866
+ "learning_rate": 1.3923076923076924e-05,
1867
+ "loss": 0.0702,
1868
+ "step": 6425
1869
+ },
1870
+ {
1871
+ "epoch": 7.10352422907489,
1872
+ "grad_norm": 0.5171650648117065,
1873
+ "learning_rate": 1.3897435897435898e-05,
1874
+ "loss": 0.0622,
1875
+ "step": 6450
1876
+ },
1877
+ {
1878
+ "epoch": 7.131057268722467,
1879
+ "grad_norm": 0.608253538608551,
1880
+ "learning_rate": 1.3871794871794873e-05,
1881
+ "loss": 0.0795,
1882
+ "step": 6475
1883
+ },
1884
+ {
1885
+ "epoch": 7.158590308370044,
1886
+ "grad_norm": 0.3569038212299347,
1887
+ "learning_rate": 1.3846153846153847e-05,
1888
+ "loss": 0.0862,
1889
+ "step": 6500
1890
+ },
1891
+ {
1892
+ "epoch": 7.186123348017621,
1893
+ "grad_norm": 0.4823140501976013,
1894
+ "learning_rate": 1.3820512820512822e-05,
1895
+ "loss": 0.0749,
1896
+ "step": 6525
1897
+ },
1898
+ {
1899
+ "epoch": 7.213656387665198,
1900
+ "grad_norm": 0.6069055199623108,
1901
+ "learning_rate": 1.3794871794871796e-05,
1902
+ "loss": 0.0732,
1903
+ "step": 6550
1904
+ },
1905
+ {
1906
+ "epoch": 7.241189427312776,
1907
+ "grad_norm": 0.3300100564956665,
1908
+ "learning_rate": 1.3769230769230771e-05,
1909
+ "loss": 0.0831,
1910
+ "step": 6575
1911
+ },
1912
+ {
1913
+ "epoch": 7.2687224669603525,
1914
+ "grad_norm": 0.5404714941978455,
1915
+ "learning_rate": 1.3743589743589745e-05,
1916
+ "loss": 0.0783,
1917
+ "step": 6600
1918
+ },
1919
+ {
1920
+ "epoch": 7.296255506607929,
1921
+ "grad_norm": 0.6272768974304199,
1922
+ "learning_rate": 1.3717948717948718e-05,
1923
+ "loss": 0.0878,
1924
+ "step": 6625
1925
+ },
1926
+ {
1927
+ "epoch": 7.323788546255507,
1928
+ "grad_norm": 0.4452053904533386,
1929
+ "learning_rate": 1.3692307692307694e-05,
1930
+ "loss": 0.0808,
1931
+ "step": 6650
1932
+ },
1933
+ {
1934
+ "epoch": 7.351321585903084,
1935
+ "grad_norm": 0.3930460810661316,
1936
+ "learning_rate": 1.3666666666666667e-05,
1937
+ "loss": 0.0861,
1938
+ "step": 6675
1939
+ },
1940
+ {
1941
+ "epoch": 7.378854625550661,
1942
+ "grad_norm": 0.31089282035827637,
1943
+ "learning_rate": 1.3641025641025643e-05,
1944
+ "loss": 0.0642,
1945
+ "step": 6700
1946
+ },
1947
+ {
1948
+ "epoch": 7.406387665198238,
1949
+ "grad_norm": 0.219461590051651,
1950
+ "learning_rate": 1.3615384615384616e-05,
1951
+ "loss": 0.0589,
1952
+ "step": 6725
1953
+ },
1954
+ {
1955
+ "epoch": 7.433920704845815,
1956
+ "grad_norm": 0.435345321893692,
1957
+ "learning_rate": 1.3589743589743592e-05,
1958
+ "loss": 0.076,
1959
+ "step": 6750
1960
+ },
1961
+ {
1962
+ "epoch": 7.461453744493392,
1963
+ "grad_norm": 0.5823142528533936,
1964
+ "learning_rate": 1.3564102564102565e-05,
1965
+ "loss": 0.0884,
1966
+ "step": 6775
1967
+ },
1968
+ {
1969
+ "epoch": 7.48898678414097,
1970
+ "grad_norm": 0.6687197685241699,
1971
+ "learning_rate": 1.353846153846154e-05,
1972
+ "loss": 0.0831,
1973
+ "step": 6800
1974
+ },
1975
+ {
1976
+ "epoch": 7.516519823788546,
1977
+ "grad_norm": 0.25778886675834656,
1978
+ "learning_rate": 1.3512820512820514e-05,
1979
+ "loss": 0.0794,
1980
+ "step": 6825
1981
+ },
1982
+ {
1983
+ "epoch": 7.544052863436123,
1984
+ "grad_norm": 0.5225228071212769,
1985
+ "learning_rate": 1.348717948717949e-05,
1986
+ "loss": 0.0772,
1987
+ "step": 6850
1988
+ },
1989
+ {
1990
+ "epoch": 7.5715859030837,
1991
+ "grad_norm": 0.4801703095436096,
1992
+ "learning_rate": 1.3461538461538463e-05,
1993
+ "loss": 0.0717,
1994
+ "step": 6875
1995
+ },
1996
+ {
1997
+ "epoch": 7.599118942731278,
1998
+ "grad_norm": 0.48749440908432007,
1999
+ "learning_rate": 1.3435897435897435e-05,
2000
+ "loss": 0.0952,
2001
+ "step": 6900
2002
+ },
2003
+ {
2004
+ "epoch": 7.6266519823788546,
2005
+ "grad_norm": 0.34919875860214233,
2006
+ "learning_rate": 1.341025641025641e-05,
2007
+ "loss": 0.0755,
2008
+ "step": 6925
2009
+ },
2010
+ {
2011
+ "epoch": 7.654185022026431,
2012
+ "grad_norm": 0.6202211976051331,
2013
+ "learning_rate": 1.3384615384615384e-05,
2014
+ "loss": 0.095,
2015
+ "step": 6950
2016
+ },
2017
+ {
2018
+ "epoch": 7.681718061674009,
2019
+ "grad_norm": 0.3826860189437866,
2020
+ "learning_rate": 1.335897435897436e-05,
2021
+ "loss": 0.095,
2022
+ "step": 6975
2023
+ },
2024
+ {
2025
+ "epoch": 7.709251101321586,
2026
+ "grad_norm": 0.6930757761001587,
2027
+ "learning_rate": 1.3333333333333333e-05,
2028
+ "loss": 0.0641,
2029
+ "step": 7000
2030
+ },
2031
+ {
2032
+ "epoch": 7.709251101321586,
2033
+ "eval_cer": 38.884367154631846,
2034
+ "eval_loss": 0.8057100772857666,
2035
+ "eval_runtime": 1352.106,
2036
+ "eval_samples_per_second": 7.826,
2037
+ "eval_steps_per_second": 1.957,
2038
+ "eval_wer": 84.13012729844414,
2039
+ "step": 7000
2040
  }
2041
  ],
2042
  "logging_steps": 25,
 
2056
  "attributes": {}
2057
  }
2058
  },
2059
+ "total_flos": 1.2027002305430684e+20,
2060
  "train_batch_size": 4,
2061
  "trial_name": null,
2062
  "trial_params": null