ljcamargo commited on
Commit
b2a6159
·
verified ·
1 Parent(s): 1c6a37f

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c62db0277afdc3f2ad6dbafa0dd57f53ea9debb5ee9712f0b1547cf8523f1070
3
  size 3809184360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fa628227eb6da8969ca4152626ddd662a6bc995ae11c142530103e8d809877f
3
  size 3809184360
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:994a3b059f463b00db236586003b2652100023cbd4f39b1b1ac679076c611649
3
  size 2458291491
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff0fa64aedf2efc21c7477814281486dfbb3531005e141ae798b948f4eb3eae1
3
  size 2458291491
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:376b730bb310b4f7540caf50ba2d9485c55172240b565241043b8847f1833fe8
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45416cdce28d4e00ea87be782021fe7b8957b5e480b7287f76264062ed6fb579
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:444dae11008b250d18996da8350dc235efbc33e7070670e4ec0778a449b281a5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:255c20227b8fb2aeae459402d3b34aa327a9576ffbb05f5a57e40623d3a94bf9
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9,
6
  "eval_steps": 500,
7
- "global_step": 2250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1583,6 +1583,181 @@
1583
  "learning_rate": 5.141129032258065e-06,
1584
  "loss": 0.2152,
1585
  "step": 2250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1586
  }
1587
  ],
1588
  "logging_steps": 10,
@@ -1597,12 +1772,12 @@
1597
  "should_evaluate": false,
1598
  "should_log": false,
1599
  "should_save": true,
1600
- "should_training_stop": false
1601
  },
1602
  "attributes": {}
1603
  }
1604
  },
1605
- "total_flos": 4.06788487884288e+16,
1606
  "train_batch_size": 2,
1607
  "trial_name": null,
1608
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1583
  "learning_rate": 5.141129032258065e-06,
1584
  "loss": 0.2152,
1585
  "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.904,
1589
+ "grad_norm": 11.215507507324219,
1590
+ "learning_rate": 4.939516129032258e-06,
1591
+ "loss": 0.3383,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.908,
1596
+ "grad_norm": 8.459249496459961,
1597
+ "learning_rate": 4.737903225806452e-06,
1598
+ "loss": 0.301,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.912,
1603
+ "grad_norm": 3.8547234535217285,
1604
+ "learning_rate": 4.536290322580646e-06,
1605
+ "loss": 0.2018,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.916,
1610
+ "grad_norm": 4.500983238220215,
1611
+ "learning_rate": 4.33467741935484e-06,
1612
+ "loss": 0.2804,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.92,
1617
+ "grad_norm": 6.937648773193359,
1618
+ "learning_rate": 4.133064516129033e-06,
1619
+ "loss": 0.2368,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.924,
1624
+ "grad_norm": 9.027796745300293,
1625
+ "learning_rate": 3.931451612903226e-06,
1626
+ "loss": 0.3048,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.928,
1631
+ "grad_norm": 13.112957954406738,
1632
+ "learning_rate": 3.7298387096774197e-06,
1633
+ "loss": 0.2602,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.932,
1638
+ "grad_norm": 8.331156730651855,
1639
+ "learning_rate": 3.5282258064516136e-06,
1640
+ "loss": 0.4963,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.936,
1645
+ "grad_norm": 14.943937301635742,
1646
+ "learning_rate": 3.3266129032258062e-06,
1647
+ "loss": 0.2576,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.94,
1652
+ "grad_norm": 9.540063858032227,
1653
+ "learning_rate": 3.125e-06,
1654
+ "loss": 0.2683,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.944,
1659
+ "grad_norm": 6.233945369720459,
1660
+ "learning_rate": 2.9233870967741936e-06,
1661
+ "loss": 0.6162,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.948,
1666
+ "grad_norm": 5.329911708831787,
1667
+ "learning_rate": 2.721774193548387e-06,
1668
+ "loss": 0.283,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.952,
1673
+ "grad_norm": 5.373264789581299,
1674
+ "learning_rate": 2.5201612903225806e-06,
1675
+ "loss": 0.4888,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.956,
1680
+ "grad_norm": 5.866879463195801,
1681
+ "learning_rate": 2.318548387096774e-06,
1682
+ "loss": 0.2366,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.96,
1687
+ "grad_norm": 6.051980972290039,
1688
+ "learning_rate": 2.1169354838709676e-06,
1689
+ "loss": 0.2216,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.964,
1694
+ "grad_norm": 8.556709289550781,
1695
+ "learning_rate": 1.9153225806451616e-06,
1696
+ "loss": 0.3197,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.968,
1701
+ "grad_norm": 5.628035068511963,
1702
+ "learning_rate": 1.7137096774193548e-06,
1703
+ "loss": 0.2376,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.972,
1708
+ "grad_norm": 15.512683868408203,
1709
+ "learning_rate": 1.5120967741935486e-06,
1710
+ "loss": 0.2198,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.976,
1715
+ "grad_norm": 5.347922325134277,
1716
+ "learning_rate": 1.310483870967742e-06,
1717
+ "loss": 0.2435,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.98,
1722
+ "grad_norm": 7.0029473304748535,
1723
+ "learning_rate": 1.1088709677419356e-06,
1724
+ "loss": 0.202,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.984,
1729
+ "grad_norm": 10.346604347229004,
1730
+ "learning_rate": 9.072580645161292e-07,
1731
+ "loss": 0.2372,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.988,
1736
+ "grad_norm": 7.6713457107543945,
1737
+ "learning_rate": 7.056451612903225e-07,
1738
+ "loss": 0.6436,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.992,
1743
+ "grad_norm": 5.280267715454102,
1744
+ "learning_rate": 5.040322580645161e-07,
1745
+ "loss": 0.2172,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.996,
1750
+ "grad_norm": 9.70234203338623,
1751
+ "learning_rate": 3.024193548387097e-07,
1752
+ "loss": 0.2253,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 1.0,
1757
+ "grad_norm": 18.04475975036621,
1758
+ "learning_rate": 1.0080645161290322e-07,
1759
+ "loss": 0.2832,
1760
+ "step": 2500
1761
  }
1762
  ],
1763
  "logging_steps": 10,
 
1772
  "should_evaluate": false,
1773
  "should_log": false,
1774
  "should_save": true,
1775
+ "should_training_stop": true
1776
  },
1777
  "attributes": {}
1778
  }
1779
  },
1780
+ "total_flos": 4.52093647947264e+16,
1781
  "train_batch_size": 2,
1782
  "trial_name": null,
1783
  "trial_params": null