ljcamargo committed
Commit 7fc4e87 · verified · Parent: d086c9b

Training in progress, step 2400, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15f22026b9de091fea7c6677cc4018fa6aa370338d59d131114563939cf90b17
+oid sha256:2239a0845b46b95928bcbeed801573f294fc2cb9b100e0b10c5cbdef012ea8a5
 size 2558403928
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d65b81af14a5a134ba9a873731d09629269da26fa8b87244dac0d9301cc842b1
+oid sha256:acb8d32926cd3273ed47be6ba17df67bab0f6dec6fb9885f960b784af33bbee9
 size 1313638993
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7ac2c5f32acf6ee420ceadbd9ac0d52af52081eb0d8301506f11df8d08763b6e
+oid sha256:8eb3097812b21e90074b0ecab86ca2780198581fab1852bdc56627bd62753aa6
 size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bc99556bf27209385963813e3570510732839e6002d61d657266050e280a33eb
+oid sha256:7da7c5085795b13d2bf0030671cbddb9f62ae43221bf1424a3830d4cf8c19012
 size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9206e9d78f9c7162a065f53b3bc585a6b187e23b8f10b603eff8a1391fd60fea
+oid sha256:ba415cf49e6d172669840b43b2ceb814d3524d31ee7518919fd94765d7715e45
 size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.6800000000000002,
+  "epoch": 1.92,
   "eval_steps": 500,
-  "global_step": 2100,
+  "global_step": 2400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1485,6 +1485,216 @@
       "learning_rate": 1.3506931508887333e-05,
       "loss": 4.0234,
       "step": 2100
+    },
+    {
+      "epoch": 1.688,
+      "grad_norm": 6.967901706695557,
+      "learning_rate": 1.2856314193601216e-05,
+      "loss": 4.2137,
+      "step": 2110
+    },
+    {
+      "epoch": 1.696,
+      "grad_norm": 7.635384559631348,
+      "learning_rate": 1.2220678396326678e-05,
+      "loss": 4.38,
+      "step": 2120
+    },
+    {
+      "epoch": 1.704,
+      "grad_norm": 9.351762771606445,
+      "learning_rate": 1.160013339393281e-05,
+      "loss": 4.4418,
+      "step": 2130
+    },
+    {
+      "epoch": 1.712,
+      "grad_norm": 7.01410436630249,
+      "learning_rate": 1.0994785868918101e-05,
+      "loss": 4.4396,
+      "step": 2140
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 6.974796772003174,
+      "learning_rate": 1.040473989106988e-05,
+      "loss": 4.5075,
+      "step": 2150
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 7.545105934143066,
+      "learning_rate": 9.830096899572927e-06,
+      "loss": 4.7132,
+      "step": 2160
+    },
+    {
+      "epoch": 1.736,
+      "grad_norm": 7.074202537536621,
+      "learning_rate": 9.270955685570226e-06,
+      "loss": 4.2393,
+      "step": 2170
+    },
+    {
+      "epoch": 1.744,
+      "grad_norm": 6.088277339935303,
+      "learning_rate": 8.727412375179156e-06,
+      "loss": 4.8092,
+      "step": 2180
+    },
+    {
+      "epoch": 1.752,
+      "grad_norm": 5.859469413757324,
+      "learning_rate": 8.199560412965634e-06,
+      "loss": 4.6403,
+      "step": 2190
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 6.770120143890381,
+      "learning_rate": 7.687490545879461e-06,
+      "loss": 4.2544,
+      "step": 2200
+    },
+    {
+      "epoch": 1.768,
+      "grad_norm": 8.092095375061035,
+      "learning_rate": 7.191290807653251e-06,
+      "loss": 4.454,
+      "step": 2210
+    },
+    {
+      "epoch": 1.776,
+      "grad_norm": 6.498937606811523,
+      "learning_rate": 6.711046503667983e-06,
+      "loss": 4.0243,
+      "step": 2220
+    },
+    {
+      "epoch": 1.784,
+      "grad_norm": 5.894200801849365,
+      "learning_rate": 6.24684019628744e-06,
+      "loss": 4.0666,
+      "step": 2230
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 6.309505462646484,
+      "learning_rate": 5.79875169066435e-06,
+      "loss": 4.2441,
+      "step": 2240
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 6.20737886428833,
+      "learning_rate": 5.366858021020471e-06,
+      "loss": 4.1951,
+      "step": 2250
+    },
+    {
+      "epoch": 1.808,
+      "grad_norm": 8.314438819885254,
+      "learning_rate": 4.951233437403102e-06,
+      "loss": 4.4043,
+      "step": 2260
+    },
+    {
+      "epoch": 1.8159999999999998,
+      "grad_norm": 6.5354108810424805,
+      "learning_rate": 4.551949392920118e-06,
+      "loss": 4.1528,
+      "step": 2270
+    },
+    {
+      "epoch": 1.8239999999999998,
+      "grad_norm": 8.503190040588379,
+      "learning_rate": 4.169074531456063e-06,
+      "loss": 3.8358,
+      "step": 2280
+    },
+    {
+      "epoch": 1.8319999999999999,
+      "grad_norm": 7.091247081756592,
+      "learning_rate": 3.802674675870932e-06,
+      "loss": 4.3702,
+      "step": 2290
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 6.265818119049072,
+      "learning_rate": 3.4528128166842033e-06,
+      "loss": 4.3354,
+      "step": 2300
+    },
+    {
+      "epoch": 1.8479999999999999,
+      "grad_norm": 7.026565074920654,
+      "learning_rate": 3.119549101245567e-06,
+      "loss": 3.9832,
+      "step": 2310
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 6.407781600952148,
+      "learning_rate": 2.8029408233946177e-06,
+      "loss": 4.1522,
+      "step": 2320
+    },
+    {
+      "epoch": 1.8639999999999999,
+      "grad_norm": 13.10364055633545,
+      "learning_rate": 2.503042413611001e-06,
+      "loss": 4.354,
+      "step": 2330
+    },
+    {
+      "epoch": 1.8719999999999999,
+      "grad_norm": 12.458951950073242,
+      "learning_rate": 2.219905429656899e-06,
+      "loss": 4.5043,
+      "step": 2340
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 9.545763969421387,
+      "learning_rate": 1.9535785477133195e-06,
+      "loss": 4.2234,
+      "step": 2350
+    },
+    {
+      "epoch": 1.888,
+      "grad_norm": 9.243309020996094,
+      "learning_rate": 1.7041075540118578e-06,
+      "loss": 4.2382,
+      "step": 2360
+    },
+    {
+      "epoch": 1.896,
+      "grad_norm": 6.87458610534668,
+      "learning_rate": 1.4715353369631924e-06,
+      "loss": 3.9852,
+      "step": 2370
+    },
+    {
+      "epoch": 1.904,
+      "grad_norm": 6.957976341247559,
+      "learning_rate": 1.2559018797838384e-06,
+      "loss": 3.9335,
+      "step": 2380
+    },
+    {
+      "epoch": 1.912,
+      "grad_norm": 5.689143657684326,
+      "learning_rate": 1.0572442536223692e-06,
+      "loss": 4.0,
+      "step": 2390
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 5.896793842315674,
+      "learning_rate": 8.755966111861913e-07,
+      "loss": 4.2807,
+      "step": 2400
     }
   ],
   "logging_steps": 10,