Azrail commited on
Commit
70a15c9
·
verified ·
1 Parent(s): 80e23f8

Training in progress, step 8000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc111bfdca8ed66f8f79b1bd7bfe63b080b3cd0c7813c6a04d18b038095b2076
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c29be95fa54dd4f69e3ce5004267e101dda94af373aaf70d214f9638f89adb
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:012e342ea6ebe917c7ae83cdc62c745f971ca211463bb0b9f52f2a90259f7532
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:160ce5599809f8d46aece51b073c165f9acf6e1fe58b27e320879b3a36e071e8
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7086e9014350d6181c560b7f34a0e20e6a473f5b7d4ab3f99a9989189826cf1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b010e746bb61fb9d87a1e1dd8c3322ee496b9630835db02b085aa96f590f941
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2eb03be752bb38e04dcd3624dd94e7f08bf42e1daf7a9f0b1188fbafdad08914
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fbdeae10eb93c5869f1e3bdcd9e48de4b232e3e091208a64c7200c54dba28a3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6897113629162455,
6
  "eval_steps": 500,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1534,11 +1534,229 @@
1534
  "eval_steps_per_second": 20.529,
1535
  "num_input_tokens_seen": 3382174368,
1536
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1537
  }
1538
  ],
1539
  "logging_steps": 50,
1540
  "max_steps": 16568,
1541
- "num_input_tokens_seen": 3382174368,
1542
  "num_train_epochs": 4,
1543
  "save_steps": 1000,
1544
  "stateful_callbacks": {
@@ -1553,7 +1771,7 @@
1553
  "attributes": {}
1554
  }
1555
  },
1556
- "total_flos": 9.047646534618317e+17,
1557
  "train_batch_size": 16,
1558
  "trial_name": null,
1559
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.9311224104893099,
6
  "eval_steps": 500,
7
+ "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1534
  "eval_steps_per_second": 20.529,
1535
  "num_input_tokens_seen": 3382174368,
1536
  "step": 7000
1537
+ },
1538
+ {
1539
+ "epoch": 1.7017819152948985,
1540
+ "grad_norm": 0.2421875,
1541
+ "learning_rate": 3.590614154217595e-05,
1542
+ "loss": 2.0955,
1543
+ "mean_token_accuracy": 0.55618498865515,
1544
+ "num_input_tokens_seen": 3406371872,
1545
+ "num_tokens": 1435550428.0,
1546
+ "step": 7050
1547
+ },
1548
+ {
1549
+ "epoch": 1.713852467673552,
1550
+ "grad_norm": 0.275390625,
1551
+ "learning_rate": 3.571751923947488e-05,
1552
+ "loss": 2.0919,
1553
+ "mean_token_accuracy": 0.5553148340806365,
1554
+ "num_input_tokens_seen": 3430521536,
1555
+ "num_tokens": 1445655664.0,
1556
+ "step": 7100
1557
+ },
1558
+ {
1559
+ "epoch": 1.725923020052205,
1560
+ "grad_norm": 0.28125,
1561
+ "learning_rate": 3.5528896936773805e-05,
1562
+ "loss": 2.0906,
1563
+ "mean_token_accuracy": 0.5547146466746926,
1564
+ "num_input_tokens_seen": 3454573728,
1565
+ "num_tokens": 1455866347.0,
1566
+ "step": 7150
1567
+ },
1568
+ {
1569
+ "epoch": 1.7379935724308584,
1570
+ "grad_norm": 0.265625,
1571
+ "learning_rate": 3.534027463407274e-05,
1572
+ "loss": 2.098,
1573
+ "mean_token_accuracy": 0.5544127273187042,
1574
+ "num_input_tokens_seen": 3478641648,
1575
+ "num_tokens": 1465996080.0,
1576
+ "step": 7200
1577
+ },
1578
+ {
1579
+ "epoch": 1.7500641248095117,
1580
+ "grad_norm": 0.275390625,
1581
+ "learning_rate": 3.515165233137166e-05,
1582
+ "loss": 2.1039,
1583
+ "mean_token_accuracy": 0.5536015385761857,
1584
+ "num_input_tokens_seen": 3502632176,
1585
+ "num_tokens": 1476133852.0,
1586
+ "step": 7250
1587
+ },
1588
+ {
1589
+ "epoch": 1.7621346771881647,
1590
+ "grad_norm": 0.2470703125,
1591
+ "learning_rate": 3.496303002867059e-05,
1592
+ "loss": 2.0974,
1593
+ "mean_token_accuracy": 0.5540728243440389,
1594
+ "num_input_tokens_seen": 3526775024,
1595
+ "num_tokens": 1486346988.0,
1596
+ "step": 7300
1597
+ },
1598
+ {
1599
+ "epoch": 1.774205229566818,
1600
+ "grad_norm": 0.24609375,
1601
+ "learning_rate": 3.477440772596952e-05,
1602
+ "loss": 2.0986,
1603
+ "mean_token_accuracy": 0.5539619905874134,
1604
+ "num_input_tokens_seen": 3551057232,
1605
+ "num_tokens": 1496540907.0,
1606
+ "step": 7350
1607
+ },
1608
+ {
1609
+ "epoch": 1.7862757819454713,
1610
+ "grad_norm": 0.2734375,
1611
+ "learning_rate": 3.4585785423268445e-05,
1612
+ "loss": 2.097,
1613
+ "mean_token_accuracy": 0.5543709811195732,
1614
+ "num_input_tokens_seen": 3575145120,
1615
+ "num_tokens": 1506733776.0,
1616
+ "step": 7400
1617
+ },
1618
+ {
1619
+ "epoch": 1.7983463343241244,
1620
+ "grad_norm": 0.263671875,
1621
+ "learning_rate": 3.4397163120567377e-05,
1622
+ "loss": 2.1054,
1623
+ "mean_token_accuracy": 0.5529748990386725,
1624
+ "num_input_tokens_seen": 3599324048,
1625
+ "num_tokens": 1516881364.0,
1626
+ "step": 7450
1627
+ },
1628
+ {
1629
+ "epoch": 1.810416886702778,
1630
+ "grad_norm": 0.26171875,
1631
+ "learning_rate": 3.420854081786631e-05,
1632
+ "loss": 2.107,
1633
+ "num_input_tokens_seen": 3623572960,
1634
+ "step": 7500
1635
+ },
1636
+ {
1637
+ "epoch": 1.810416886702778,
1638
+ "eval_loss": 1.9704335927963257,
1639
+ "eval_mean_token_accuracy": 0.5782234972207915,
1640
+ "eval_num_tokens": 1527213100.0,
1641
+ "eval_runtime": 130.0921,
1642
+ "eval_samples_per_second": 82.342,
1643
+ "eval_steps_per_second": 20.585,
1644
+ "num_input_tokens_seen": 3623572960,
1645
+ "step": 7500
1646
+ },
1647
+ {
1648
+ "epoch": 1.822487439081431,
1649
+ "grad_norm": 0.236328125,
1650
+ "learning_rate": 3.4019918515165234e-05,
1651
+ "loss": 2.0859,
1652
+ "mean_token_accuracy": 0.5543704128451645,
1653
+ "num_input_tokens_seen": 3647842624,
1654
+ "num_tokens": 1537348409.0,
1655
+ "step": 7550
1656
+ },
1657
+ {
1658
+ "epoch": 1.8345579914600842,
1659
+ "grad_norm": 0.275390625,
1660
+ "learning_rate": 3.3831296212464166e-05,
1661
+ "loss": 2.0964,
1662
+ "mean_token_accuracy": 0.5545977150648832,
1663
+ "num_input_tokens_seen": 3671838176,
1664
+ "num_tokens": 1547405691.0,
1665
+ "step": 7600
1666
+ },
1667
+ {
1668
+ "epoch": 1.8466285438387375,
1669
+ "grad_norm": 0.271484375,
1670
+ "learning_rate": 3.364267390976309e-05,
1671
+ "loss": 2.1049,
1672
+ "mean_token_accuracy": 0.5531406961008907,
1673
+ "num_input_tokens_seen": 3696051376,
1674
+ "num_tokens": 1557614207.0,
1675
+ "step": 7650
1676
+ },
1677
+ {
1678
+ "epoch": 1.8586990962173906,
1679
+ "grad_norm": 0.26171875,
1680
+ "learning_rate": 3.345405160706202e-05,
1681
+ "loss": 2.0976,
1682
+ "mean_token_accuracy": 0.5547824421525002,
1683
+ "num_input_tokens_seen": 3720242912,
1684
+ "num_tokens": 1567813391.0,
1685
+ "step": 7700
1686
+ },
1687
+ {
1688
+ "epoch": 1.8707696485960439,
1689
+ "grad_norm": 0.26171875,
1690
+ "learning_rate": 3.326542930436095e-05,
1691
+ "loss": 2.0941,
1692
+ "mean_token_accuracy": 0.5552040388435125,
1693
+ "num_input_tokens_seen": 3744303744,
1694
+ "num_tokens": 1577952759.0,
1695
+ "step": 7750
1696
+ },
1697
+ {
1698
+ "epoch": 1.8828402009746972,
1699
+ "grad_norm": 0.2734375,
1700
+ "learning_rate": 3.3076807001659874e-05,
1701
+ "loss": 2.107,
1702
+ "mean_token_accuracy": 0.5526882111281156,
1703
+ "num_input_tokens_seen": 3768596640,
1704
+ "num_tokens": 1588260331.0,
1705
+ "step": 7800
1706
+ },
1707
+ {
1708
+ "epoch": 1.8949107533533502,
1709
+ "grad_norm": 0.2890625,
1710
+ "learning_rate": 3.2888184698958806e-05,
1711
+ "loss": 2.0974,
1712
+ "mean_token_accuracy": 0.5548465251550079,
1713
+ "num_input_tokens_seen": 3792634928,
1714
+ "num_tokens": 1598444722.0,
1715
+ "step": 7850
1716
+ },
1717
+ {
1718
+ "epoch": 1.9069813057320035,
1719
+ "grad_norm": 0.251953125,
1720
+ "learning_rate": 3.269956239625773e-05,
1721
+ "loss": 2.0957,
1722
+ "mean_token_accuracy": 0.554438531845808,
1723
+ "num_input_tokens_seen": 3816849872,
1724
+ "num_tokens": 1608557410.0,
1725
+ "step": 7900
1726
+ },
1727
+ {
1728
+ "epoch": 1.9190518581106568,
1729
+ "grad_norm": 0.2490234375,
1730
+ "learning_rate": 3.251094009355666e-05,
1731
+ "loss": 2.108,
1732
+ "mean_token_accuracy": 0.5528679783269763,
1733
+ "num_input_tokens_seen": 3840903600,
1734
+ "num_tokens": 1618721583.0,
1735
+ "step": 7950
1736
+ },
1737
+ {
1738
+ "epoch": 1.9311224104893099,
1739
+ "grad_norm": 0.263671875,
1740
+ "learning_rate": 3.2322317790855595e-05,
1741
+ "loss": 2.0908,
1742
+ "num_input_tokens_seen": 3864935104,
1743
+ "step": 8000
1744
+ },
1745
+ {
1746
+ "epoch": 1.9311224104893099,
1747
+ "eval_loss": 1.9698705673217773,
1748
+ "eval_mean_token_accuracy": 0.5783014823866923,
1749
+ "eval_num_tokens": 1628824365.0,
1750
+ "eval_runtime": 130.0192,
1751
+ "eval_samples_per_second": 82.388,
1752
+ "eval_steps_per_second": 20.597,
1753
+ "num_input_tokens_seen": 3864935104,
1754
+ "step": 8000
1755
  }
1756
  ],
1757
  "logging_steps": 50,
1758
  "max_steps": 16568,
1759
+ "num_input_tokens_seen": 3864935104,
1760
  "num_train_epochs": 4,
1761
  "save_steps": 1000,
1762
  "stateful_callbacks": {
 
1771
  "attributes": {}
1772
  }
1773
  },
1774
+ "total_flos": 1.033907862086615e+18,
1775
  "train_batch_size": 16,
1776
  "trial_name": null,
1777
  "trial_params": null