ljcamargo commited on
Commit
7059436
·
verified ·
1 Parent(s): 33f17e6

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:92ecbb21d1e0fe04a76374b42b85859839cd5847c3b922def8d9c835efea99e0
3
  size 1917255968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fffc04b59c62fd75993065cac00d29eabde87430fe6889a8d0a88622aaf4bff7
3
  size 1917255968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb40c6114ec922d7714527c3e12b5ccaf476fde4ca857ba014ddc6cfb4ede0c4
3
- size 2479129381
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:734307ae4ce6cfe0da3c2c0adc7de19df318654f89f713e32d45992201dd3c38
3
+ size 2479961379
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:946649cc0ec301f9f67d287ff0bc2472a821330c5cd88309b298943469bb0e90
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e67abcfba71a76609e77cfb89c9a6ffd007caab84f81a31e8c3a8feeab5e1904
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2878531755614586,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1415,6 +1415,356 @@
1415
  "learning_rate": 1.541138862413009e-05,
1416
  "loss": 0.1555,
1417
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1418
  }
1419
  ],
1420
  "logging_steps": 10,
@@ -1434,7 +1784,7 @@
1434
  "attributes": {}
1435
  }
1436
  },
1437
- "total_flos": 1.5616435963670323e+17,
1438
  "train_batch_size": 4,
1439
  "trial_name": null,
1440
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.609836593415439,
6
  "eval_steps": 500,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1415
  "learning_rate": 1.541138862413009e-05,
1416
  "loss": 0.1555,
1417
  "step": 2000
1418
+ },
1419
+ {
1420
+ "epoch": 1.2942928439185382,
1421
+ "grad_norm": 2.5,
1422
+ "learning_rate": 1.5165978463439823e-05,
1423
+ "loss": 0.129,
1424
+ "step": 2010
1425
+ },
1426
+ {
1427
+ "epoch": 1.300732512275618,
1428
+ "grad_norm": 7.9375,
1429
+ "learning_rate": 1.4921684336293367e-05,
1430
+ "loss": 0.1734,
1431
+ "step": 2020
1432
+ },
1433
+ {
1434
+ "epoch": 1.3071721806326975,
1435
+ "grad_norm": 6.53125,
1436
+ "learning_rate": 1.467853396689688e-05,
1437
+ "loss": 0.1907,
1438
+ "step": 2030
1439
+ },
1440
+ {
1441
+ "epoch": 1.313611848989777,
1442
+ "grad_norm": 11.125,
1443
+ "learning_rate": 1.4436554949654906e-05,
1444
+ "loss": 0.139,
1445
+ "step": 2040
1446
+ },
1447
+ {
1448
+ "epoch": 1.3200515173468568,
1449
+ "grad_norm": 4.5625,
1450
+ "learning_rate": 1.4195774746038723e-05,
1451
+ "loss": 0.1053,
1452
+ "step": 2050
1453
+ },
1454
+ {
1455
+ "epoch": 1.3264911857039363,
1456
+ "grad_norm": 6.40625,
1457
+ "learning_rate": 1.3956220681469866e-05,
1458
+ "loss": 0.1758,
1459
+ "step": 2060
1460
+ },
1461
+ {
1462
+ "epoch": 1.3329308540610159,
1463
+ "grad_norm": 5.40625,
1464
+ "learning_rate": 1.3717919942219067e-05,
1465
+ "loss": 0.1186,
1466
+ "step": 2070
1467
+ },
1468
+ {
1469
+ "epoch": 1.3393705224180954,
1470
+ "grad_norm": 9.5625,
1471
+ "learning_rate": 1.3480899572320893e-05,
1472
+ "loss": 0.1283,
1473
+ "step": 2080
1474
+ },
1475
+ {
1476
+ "epoch": 1.345810190775175,
1477
+ "grad_norm": 7.375,
1478
+ "learning_rate": 1.3245186470504647e-05,
1479
+ "loss": 0.1294,
1480
+ "step": 2090
1481
+ },
1482
+ {
1483
+ "epoch": 1.3522498591322547,
1484
+ "grad_norm": 8.875,
1485
+ "learning_rate": 1.3010807387141738e-05,
1486
+ "loss": 0.1039,
1487
+ "step": 2100
1488
+ },
1489
+ {
1490
+ "epoch": 1.3586895274893342,
1491
+ "grad_norm": 9.75,
1492
+ "learning_rate": 1.2777788921209805e-05,
1493
+ "loss": 0.1128,
1494
+ "step": 2110
1495
+ },
1496
+ {
1497
+ "epoch": 1.3651291958464138,
1498
+ "grad_norm": 4.4375,
1499
+ "learning_rate": 1.25461575172741e-05,
1500
+ "loss": 0.1235,
1501
+ "step": 2120
1502
+ },
1503
+ {
1504
+ "epoch": 1.3715688642034936,
1505
+ "grad_norm": 9.1875,
1506
+ "learning_rate": 1.2315939462486395e-05,
1507
+ "loss": 0.1536,
1508
+ "step": 2130
1509
+ },
1510
+ {
1511
+ "epoch": 1.378008532560573,
1512
+ "grad_norm": 4.75,
1513
+ "learning_rate": 1.2087160883601695e-05,
1514
+ "loss": 0.1011,
1515
+ "step": 2140
1516
+ },
1517
+ {
1518
+ "epoch": 1.3844482009176526,
1519
+ "grad_norm": 7.0,
1520
+ "learning_rate": 1.185984774401325e-05,
1521
+ "loss": 0.1795,
1522
+ "step": 2150
1523
+ },
1524
+ {
1525
+ "epoch": 1.3908878692747324,
1526
+ "grad_norm": 4.25,
1527
+ "learning_rate": 1.163402584080597e-05,
1528
+ "loss": 0.089,
1529
+ "step": 2160
1530
+ },
1531
+ {
1532
+ "epoch": 1.397327537631812,
1533
+ "grad_norm": 6.28125,
1534
+ "learning_rate": 1.1409720801828849e-05,
1535
+ "loss": 0.1405,
1536
+ "step": 2170
1537
+ },
1538
+ {
1539
+ "epoch": 1.4037672059888915,
1540
+ "grad_norm": 2.84375,
1541
+ "learning_rate": 1.1186958082786517e-05,
1542
+ "loss": 0.1533,
1543
+ "step": 2180
1544
+ },
1545
+ {
1546
+ "epoch": 1.4102068743459713,
1547
+ "grad_norm": 4.75,
1548
+ "learning_rate": 1.096576296435034e-05,
1549
+ "loss": 0.1359,
1550
+ "step": 2190
1551
+ },
1552
+ {
1553
+ "epoch": 1.4166465427030508,
1554
+ "grad_norm": 13.875,
1555
+ "learning_rate": 1.0746160549289424e-05,
1556
+ "loss": 0.1046,
1557
+ "step": 2200
1558
+ },
1559
+ {
1560
+ "epoch": 1.4230862110601303,
1561
+ "grad_norm": 5.375,
1562
+ "learning_rate": 1.0528175759621727e-05,
1563
+ "loss": 0.1285,
1564
+ "step": 2210
1565
+ },
1566
+ {
1567
+ "epoch": 1.42952587941721,
1568
+ "grad_norm": 4.375,
1569
+ "learning_rate": 1.0311833333785807e-05,
1570
+ "loss": 0.163,
1571
+ "step": 2220
1572
+ },
1573
+ {
1574
+ "epoch": 1.4359655477742896,
1575
+ "grad_norm": 7.3125,
1576
+ "learning_rate": 1.0097157823833248e-05,
1577
+ "loss": 0.2187,
1578
+ "step": 2230
1579
+ },
1580
+ {
1581
+ "epoch": 1.4424052161313692,
1582
+ "grad_norm": 5.9375,
1583
+ "learning_rate": 9.884173592642374e-06,
1584
+ "loss": 0.1137,
1585
+ "step": 2240
1586
+ },
1587
+ {
1588
+ "epoch": 1.448844884488449,
1589
+ "grad_norm": 12.625,
1590
+ "learning_rate": 9.672904811153363e-06,
1591
+ "loss": 0.1985,
1592
+ "step": 2250
1593
+ },
1594
+ {
1595
+ "epoch": 1.4552845528455285,
1596
+ "grad_norm": 3.96875,
1597
+ "learning_rate": 9.463375455625183e-06,
1598
+ "loss": 0.0877,
1599
+ "step": 2260
1600
+ },
1601
+ {
1602
+ "epoch": 1.461724221202608,
1603
+ "grad_norm": 5.5625,
1604
+ "learning_rate": 9.255609304914556e-06,
1605
+ "loss": 0.1086,
1606
+ "step": 2270
1607
+ },
1608
+ {
1609
+ "epoch": 1.4681638895596878,
1610
+ "grad_norm": 3.0,
1611
+ "learning_rate": 9.049629937777382e-06,
1612
+ "loss": 0.1154,
1613
+ "step": 2280
1614
+ },
1615
+ {
1616
+ "epoch": 1.4746035579167673,
1617
+ "grad_norm": 3.671875,
1618
+ "learning_rate": 8.8454607301929e-06,
1619
+ "loss": 0.1317,
1620
+ "step": 2290
1621
+ },
1622
+ {
1623
+ "epoch": 1.4810432262738469,
1624
+ "grad_norm": 4.5625,
1625
+ "learning_rate": 8.643124852710749e-06,
1626
+ "loss": 0.0975,
1627
+ "step": 2300
1628
+ },
1629
+ {
1630
+ "epoch": 1.4874828946309266,
1631
+ "grad_norm": 6.28125,
1632
+ "learning_rate": 8.442645267821458e-06,
1633
+ "loss": 0.1085,
1634
+ "step": 2310
1635
+ },
1636
+ {
1637
+ "epoch": 1.4939225629880062,
1638
+ "grad_norm": 4.84375,
1639
+ "learning_rate": 8.244044727350492e-06,
1640
+ "loss": 0.1105,
1641
+ "step": 2320
1642
+ },
1643
+ {
1644
+ "epoch": 1.5003622313450857,
1645
+ "grad_norm": 4.5625,
1646
+ "learning_rate": 8.047345769876204e-06,
1647
+ "loss": 0.1523,
1648
+ "step": 2330
1649
+ },
1650
+ {
1651
+ "epoch": 1.5068018997021655,
1652
+ "grad_norm": 12.5,
1653
+ "learning_rate": 7.85257071817202e-06,
1654
+ "loss": 0.1637,
1655
+ "step": 2340
1656
+ },
1657
+ {
1658
+ "epoch": 1.5132415680592448,
1659
+ "grad_norm": 4.125,
1660
+ "learning_rate": 7.659741676673066e-06,
1661
+ "loss": 0.1124,
1662
+ "step": 2350
1663
+ },
1664
+ {
1665
+ "epoch": 1.5196812364163246,
1666
+ "grad_norm": 9.0,
1667
+ "learning_rate": 7.468880528967651e-06,
1668
+ "loss": 0.0965,
1669
+ "step": 2360
1670
+ },
1671
+ {
1672
+ "epoch": 1.5261209047734041,
1673
+ "grad_norm": 5.53125,
1674
+ "learning_rate": 7.280008935313701e-06,
1675
+ "loss": 0.0999,
1676
+ "step": 2370
1677
+ },
1678
+ {
1679
+ "epoch": 1.5325605731304837,
1680
+ "grad_norm": 3.578125,
1681
+ "learning_rate": 7.093148330180649e-06,
1682
+ "loss": 0.1199,
1683
+ "step": 2380
1684
+ },
1685
+ {
1686
+ "epoch": 1.5390002414875634,
1687
+ "grad_norm": 4.71875,
1688
+ "learning_rate": 6.908319919816877e-06,
1689
+ "loss": 0.1595,
1690
+ "step": 2390
1691
+ },
1692
+ {
1693
+ "epoch": 1.545439909844643,
1694
+ "grad_norm": 5.4375,
1695
+ "learning_rate": 6.725544679843101e-06,
1696
+ "loss": 0.0966,
1697
+ "step": 2400
1698
+ },
1699
+ {
1700
+ "epoch": 1.5518795782017225,
1701
+ "grad_norm": 2.765625,
1702
+ "learning_rate": 6.5448433528718876e-06,
1703
+ "loss": 0.0993,
1704
+ "step": 2410
1705
+ },
1706
+ {
1707
+ "epoch": 1.5583192465588023,
1708
+ "grad_norm": 7.875,
1709
+ "learning_rate": 6.3662364461536725e-06,
1710
+ "loss": 0.1303,
1711
+ "step": 2420
1712
+ },
1713
+ {
1714
+ "epoch": 1.5647589149158818,
1715
+ "grad_norm": 4.625,
1716
+ "learning_rate": 6.189744229249406e-06,
1717
+ "loss": 0.1044,
1718
+ "step": 2430
1719
+ },
1720
+ {
1721
+ "epoch": 1.5711985832729614,
1722
+ "grad_norm": 5.15625,
1723
+ "learning_rate": 6.015386731730277e-06,
1724
+ "loss": 0.1186,
1725
+ "step": 2440
1726
+ },
1727
+ {
1728
+ "epoch": 1.5776382516300411,
1729
+ "grad_norm": 5.96875,
1730
+ "learning_rate": 5.843183740904564e-06,
1731
+ "loss": 0.1427,
1732
+ "step": 2450
1733
+ },
1734
+ {
1735
+ "epoch": 1.5840779199871207,
1736
+ "grad_norm": 8.5625,
1737
+ "learning_rate": 5.673154799572064e-06,
1738
+ "loss": 0.0971,
1739
+ "step": 2460
1740
+ },
1741
+ {
1742
+ "epoch": 1.5905175883442002,
1743
+ "grad_norm": 4.9375,
1744
+ "learning_rate": 5.505319203806239e-06,
1745
+ "loss": 0.0823,
1746
+ "step": 2470
1747
+ },
1748
+ {
1749
+ "epoch": 1.59695725670128,
1750
+ "grad_norm": 8.625,
1751
+ "learning_rate": 5.339696000764346e-06,
1752
+ "loss": 0.0963,
1753
+ "step": 2480
1754
+ },
1755
+ {
1756
+ "epoch": 1.6033969250583595,
1757
+ "grad_norm": 4.25,
1758
+ "learning_rate": 5.17630398652584e-06,
1759
+ "loss": 0.1075,
1760
+ "step": 2490
1761
+ },
1762
+ {
1763
+ "epoch": 1.609836593415439,
1764
+ "grad_norm": 5.6875,
1765
+ "learning_rate": 5.01516170395929e-06,
1766
+ "loss": 0.0782,
1767
+ "step": 2500
1768
  }
1769
  ],
1770
  "logging_steps": 10,
 
1784
  "attributes": {}
1785
  }
1786
  },
1787
+ "total_flos": 1.9527822955090944e+17,
1788
  "train_batch_size": 4,
1789
  "trial_name": null,
1790
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ef279fd8d8317bf8b36d84bffa82d19dc9808ff2d80cffafbb2e258d411547d
3
  size 6417
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a850724e0b50935e4c4d54b3c968fe94638af4b1c59c2e07336ea7136cfb41e
3
  size 6417