owenisas commited on
Commit
b7c9249
·
verified ·
1 Parent(s): 495018c

Training in progress, step 250, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13d51391eae20fd1e2416fc130cc061ef37cae0dce1392cfc4c82bb34308c5af
3
  size 3537299144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c2a5ff429f650539cd5c6ad9ea7f9569fd24863056cad28726290ed985d9fea
3
  size 3537299144
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:288427ba428a2c192b29b8b7ec8c3fc4db475d46a549e17e5af85ee285de0e47
3
  size 1830175435
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85bb5e2364254f0b84ca558a536ce2983868014e01a90e171fbe557dd01d62f6
3
  size 1830175435
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:89645f9f6a2a8823b334e0748e4f5ea683a27b3d02632f0a241397db55d82fad
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d895ccae2b55d4ea213653ca4a80d00de131463e105716eab1b7022906f260bf
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9185b2c165caf80f353544d8268b2b79fb90be99e8dae42f53266d3abff70104
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b7f2a236446ef1e40ceb20dfad68baf17d74c3d4a45e7640820b9ddfc1c6c59
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7511737089201878,
6
  "eval_steps": 50,
7
- "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1440,6 +1440,364 @@
1440
  "eval_samples_per_second": 2.575,
1441
  "eval_steps_per_second": 0.644,
1442
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1443
  }
1444
  ],
1445
  "logging_steps": 1,
@@ -1459,7 +1817,7 @@
1459
  "attributes": {}
1460
  }
1461
  },
1462
- "total_flos": 2.932889770721088e+18,
1463
  "train_batch_size": 4,
1464
  "trial_name": null,
1465
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9389671361502347,
6
  "eval_steps": 50,
7
+ "global_step": 250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1440
  "eval_samples_per_second": 2.575,
1441
  "eval_steps_per_second": 0.644,
1442
  "step": 200
1443
+ },
1444
+ {
1445
+ "epoch": 0.7549295774647887,
1446
+ "grad_norm": 0.4975653886795044,
1447
+ "learning_rate": 3.0572508230355246e-05,
1448
+ "loss": 3.763,
1449
+ "step": 201
1450
+ },
1451
+ {
1452
+ "epoch": 0.7586854460093897,
1453
+ "grad_norm": 0.5943359136581421,
1454
+ "learning_rate": 2.971452939326802e-05,
1455
+ "loss": 4.1011,
1456
+ "step": 202
1457
+ },
1458
+ {
1459
+ "epoch": 0.7624413145539906,
1460
+ "grad_norm": 0.5947958827018738,
1461
+ "learning_rate": 2.8866656049429162e-05,
1462
+ "loss": 3.837,
1463
+ "step": 203
1464
+ },
1465
+ {
1466
+ "epoch": 0.7661971830985915,
1467
+ "grad_norm": 0.55486661195755,
1468
+ "learning_rate": 2.8029010104237785e-05,
1469
+ "loss": 3.773,
1470
+ "step": 204
1471
+ },
1472
+ {
1473
+ "epoch": 0.7699530516431925,
1474
+ "grad_norm": 0.6001894474029541,
1475
+ "learning_rate": 2.720171199261987e-05,
1476
+ "loss": 4.1092,
1477
+ "step": 205
1478
+ },
1479
+ {
1480
+ "epoch": 0.7737089201877935,
1481
+ "grad_norm": 0.611171305179596,
1482
+ "learning_rate": 2.638488066171201e-05,
1483
+ "loss": 4.2872,
1484
+ "step": 206
1485
+ },
1486
+ {
1487
+ "epoch": 0.7774647887323943,
1488
+ "grad_norm": 0.5929466485977173,
1489
+ "learning_rate": 2.5578633553759878e-05,
1490
+ "loss": 4.0139,
1491
+ "step": 207
1492
+ },
1493
+ {
1494
+ "epoch": 0.7812206572769953,
1495
+ "grad_norm": 0.5859886407852173,
1496
+ "learning_rate": 2.4783086589232295e-05,
1497
+ "loss": 3.9495,
1498
+ "step": 208
1499
+ },
1500
+ {
1501
+ "epoch": 0.7849765258215963,
1502
+ "grad_norm": 0.5463722348213196,
1503
+ "learning_rate": 2.3998354150154555e-05,
1504
+ "loss": 3.7008,
1505
+ "step": 209
1506
+ },
1507
+ {
1508
+ "epoch": 0.7887323943661971,
1509
+ "grad_norm": 0.5370416045188904,
1510
+ "learning_rate": 2.3224549063662927e-05,
1511
+ "loss": 3.9123,
1512
+ "step": 210
1513
+ },
1514
+ {
1515
+ "epoch": 0.7924882629107981,
1516
+ "grad_norm": 0.5654124021530151,
1517
+ "learning_rate": 2.246178258578234e-05,
1518
+ "loss": 3.816,
1519
+ "step": 211
1520
+ },
1521
+ {
1522
+ "epoch": 0.7962441314553991,
1523
+ "grad_norm": 0.5404929518699646,
1524
+ "learning_rate": 2.171016438543059e-05,
1525
+ "loss": 3.943,
1526
+ "step": 212
1527
+ },
1528
+ {
1529
+ "epoch": 0.8,
1530
+ "grad_norm": 0.5264220237731934,
1531
+ "learning_rate": 2.096980252865005e-05,
1532
+ "loss": 3.8148,
1533
+ "step": 213
1534
+ },
1535
+ {
1536
+ "epoch": 0.8037558685446009,
1537
+ "grad_norm": 0.5364089012145996,
1538
+ "learning_rate": 2.0240803463070425e-05,
1539
+ "loss": 4.0956,
1540
+ "step": 214
1541
+ },
1542
+ {
1543
+ "epoch": 0.8075117370892019,
1544
+ "grad_norm": 0.49832502007484436,
1545
+ "learning_rate": 1.9523272002603742e-05,
1546
+ "loss": 3.5919,
1547
+ "step": 215
1548
+ },
1549
+ {
1550
+ "epoch": 0.8112676056338028,
1551
+ "grad_norm": 0.5661212205886841,
1552
+ "learning_rate": 1.8817311312374564e-05,
1553
+ "loss": 3.9309,
1554
+ "step": 216
1555
+ },
1556
+ {
1557
+ "epoch": 0.8150234741784037,
1558
+ "grad_norm": 0.6174516677856445,
1559
+ "learning_rate": 1.8123022893887065e-05,
1560
+ "loss": 4.4702,
1561
+ "step": 217
1562
+ },
1563
+ {
1564
+ "epoch": 0.8187793427230047,
1565
+ "grad_norm": 0.5399917364120483,
1566
+ "learning_rate": 1.744050657043137e-05,
1567
+ "loss": 3.8469,
1568
+ "step": 218
1569
+ },
1570
+ {
1571
+ "epoch": 0.8225352112676056,
1572
+ "grad_norm": 0.48354753851890564,
1573
+ "learning_rate": 1.6769860472731257e-05,
1574
+ "loss": 3.5587,
1575
+ "step": 219
1576
+ },
1577
+ {
1578
+ "epoch": 0.8262910798122066,
1579
+ "grad_norm": 0.5603431463241577,
1580
+ "learning_rate": 1.6111181024835e-05,
1581
+ "loss": 4.3805,
1582
+ "step": 220
1583
+ },
1584
+ {
1585
+ "epoch": 0.8300469483568075,
1586
+ "grad_norm": 0.5792990326881409,
1587
+ "learning_rate": 1.5464562930251814e-05,
1588
+ "loss": 4.2204,
1589
+ "step": 221
1590
+ },
1591
+ {
1592
+ "epoch": 0.8338028169014085,
1593
+ "grad_norm": 0.5376021862030029,
1594
+ "learning_rate": 1.4830099158335563e-05,
1595
+ "loss": 3.8365,
1596
+ "step": 222
1597
+ },
1598
+ {
1599
+ "epoch": 0.8375586854460094,
1600
+ "grad_norm": 0.5793043971061707,
1601
+ "learning_rate": 1.4207880930917871e-05,
1602
+ "loss": 4.064,
1603
+ "step": 223
1604
+ },
1605
+ {
1606
+ "epoch": 0.8413145539906103,
1607
+ "grad_norm": 0.5597378611564636,
1608
+ "learning_rate": 1.3597997709192378e-05,
1609
+ "loss": 3.8224,
1610
+ "step": 224
1611
+ },
1612
+ {
1613
+ "epoch": 0.8450704225352113,
1614
+ "grad_norm": 0.5336353182792664,
1615
+ "learning_rate": 1.3000537180852212e-05,
1616
+ "loss": 3.7203,
1617
+ "step": 225
1618
+ },
1619
+ {
1620
+ "epoch": 0.8488262910798122,
1621
+ "grad_norm": 0.640953004360199,
1622
+ "learning_rate": 1.2415585247482498e-05,
1623
+ "loss": 4.3212,
1624
+ "step": 226
1625
+ },
1626
+ {
1627
+ "epoch": 0.8525821596244132,
1628
+ "grad_norm": 0.45982062816619873,
1629
+ "learning_rate": 1.1843226012209529e-05,
1630
+ "loss": 3.6229,
1631
+ "step": 227
1632
+ },
1633
+ {
1634
+ "epoch": 0.856338028169014,
1635
+ "grad_norm": 0.5055301189422607,
1636
+ "learning_rate": 1.128354176760873e-05,
1637
+ "loss": 3.6906,
1638
+ "step": 228
1639
+ },
1640
+ {
1641
+ "epoch": 0.860093896713615,
1642
+ "grad_norm": 0.4451459050178528,
1643
+ "learning_rate": 1.073661298387265e-05,
1644
+ "loss": 3.3596,
1645
+ "step": 229
1646
+ },
1647
+ {
1648
+ "epoch": 0.863849765258216,
1649
+ "grad_norm": 0.6167091727256775,
1650
+ "learning_rate": 1.0202518297241237e-05,
1651
+ "loss": 4.6817,
1652
+ "step": 230
1653
+ },
1654
+ {
1655
+ "epoch": 0.8676056338028169,
1656
+ "grad_norm": 0.5457577705383301,
1657
+ "learning_rate": 9.681334498695648e-06,
1658
+ "loss": 4.2546,
1659
+ "step": 231
1660
+ },
1661
+ {
1662
+ "epoch": 0.8713615023474178,
1663
+ "grad_norm": 0.49405384063720703,
1664
+ "learning_rate": 9.173136522917457e-06,
1665
+ "loss": 3.7713,
1666
+ "step": 232
1667
+ },
1668
+ {
1669
+ "epoch": 0.8751173708920188,
1670
+ "grad_norm": 0.5279140472412109,
1671
+ "learning_rate": 8.677997437514629e-06,
1672
+ "loss": 3.7468,
1673
+ "step": 233
1674
+ },
1675
+ {
1676
+ "epoch": 0.8788732394366198,
1677
+ "grad_norm": 0.5161781311035156,
1678
+ "learning_rate": 8.195988432516078e-06,
1679
+ "loss": 4.2746,
1680
+ "step": 234
1681
+ },
1682
+ {
1683
+ "epoch": 0.8826291079812206,
1684
+ "grad_norm": 0.5855900049209595,
1685
+ "learning_rate": 7.727178810136093e-06,
1686
+ "loss": 4.1113,
1687
+ "step": 235
1688
+ },
1689
+ {
1690
+ "epoch": 0.8863849765258216,
1691
+ "grad_norm": 0.4686482548713684,
1692
+ "learning_rate": 7.27163597481022e-06,
1693
+ "loss": 3.3821,
1694
+ "step": 236
1695
+ },
1696
+ {
1697
+ "epoch": 0.8901408450704226,
1698
+ "grad_norm": 0.5629131197929382,
1699
+ "learning_rate": 6.829425423504021e-06,
1700
+ "loss": 4.1901,
1701
+ "step": 237
1702
+ },
1703
+ {
1704
+ "epoch": 0.8938967136150234,
1705
+ "grad_norm": 0.5782991647720337,
1706
+ "learning_rate": 6.4006107362960195e-06,
1707
+ "loss": 4.3302,
1708
+ "step": 238
1709
+ },
1710
+ {
1711
+ "epoch": 0.8976525821596244,
1712
+ "grad_norm": 0.5707590579986572,
1713
+ "learning_rate": 5.985253567236304e-06,
1714
+ "loss": 3.9955,
1715
+ "step": 239
1716
+ },
1717
+ {
1718
+ "epoch": 0.9014084507042254,
1719
+ "grad_norm": 0.4625610411167145,
1720
+ "learning_rate": 5.583413635482082e-06,
1721
+ "loss": 3.5662,
1722
+ "step": 240
1723
+ },
1724
+ {
1725
+ "epoch": 0.9051643192488263,
1726
+ "grad_norm": 0.6621753573417664,
1727
+ "learning_rate": 5.19514871671134e-06,
1728
+ "loss": 4.5634,
1729
+ "step": 241
1730
+ },
1731
+ {
1732
+ "epoch": 0.9089201877934272,
1733
+ "grad_norm": 0.4976242482662201,
1734
+ "learning_rate": 4.82051463481602e-06,
1735
+ "loss": 3.5897,
1736
+ "step": 242
1737
+ },
1738
+ {
1739
+ "epoch": 0.9126760563380282,
1740
+ "grad_norm": 0.51161789894104,
1741
+ "learning_rate": 4.45956525387573e-06,
1742
+ "loss": 3.6594,
1743
+ "step": 243
1744
+ },
1745
+ {
1746
+ "epoch": 0.9164319248826291,
1747
+ "grad_norm": 0.5785262584686279,
1748
+ "learning_rate": 4.112352470413328e-06,
1749
+ "loss": 4.031,
1750
+ "step": 244
1751
+ },
1752
+ {
1753
+ "epoch": 0.92018779342723,
1754
+ "grad_norm": 0.5122177004814148,
1755
+ "learning_rate": 3.778926205933342e-06,
1756
+ "loss": 3.6733,
1757
+ "step": 245
1758
+ },
1759
+ {
1760
+ "epoch": 0.923943661971831,
1761
+ "grad_norm": 0.5668466687202454,
1762
+ "learning_rate": 3.459334399744374e-06,
1763
+ "loss": 3.8761,
1764
+ "step": 246
1765
+ },
1766
+ {
1767
+ "epoch": 0.927699530516432,
1768
+ "grad_norm": 0.5304160714149475,
1769
+ "learning_rate": 3.1536230020664417e-06,
1770
+ "loss": 3.3638,
1771
+ "step": 247
1772
+ },
1773
+ {
1774
+ "epoch": 0.9314553990610329,
1775
+ "grad_norm": 0.5929594039916992,
1776
+ "learning_rate": 2.861835967424409e-06,
1777
+ "loss": 4.1158,
1778
+ "step": 248
1779
+ },
1780
+ {
1781
+ "epoch": 0.9352112676056338,
1782
+ "grad_norm": 0.5661305785179138,
1783
+ "learning_rate": 2.5840152483282752e-06,
1784
+ "loss": 3.8846,
1785
+ "step": 249
1786
+ },
1787
+ {
1788
+ "epoch": 0.9389671361502347,
1789
+ "grad_norm": 0.5555335879325867,
1790
+ "learning_rate": 2.3202007892413447e-06,
1791
+ "loss": 3.9409,
1792
+ "step": 250
1793
+ },
1794
+ {
1795
+ "epoch": 0.9389671361502347,
1796
+ "eval_loss": 0.4938514232635498,
1797
+ "eval_runtime": 365.2814,
1798
+ "eval_samples_per_second": 2.593,
1799
+ "eval_steps_per_second": 0.649,
1800
+ "step": 250
1801
  }
1802
  ],
1803
  "logging_steps": 1,
 
1817
  "attributes": {}
1818
  }
1819
  },
1820
+ "total_flos": 3.688311494350195e+18,
1821
  "train_batch_size": 4,
1822
  "trial_name": null,
1823
  "trial_params": null