8BitStudio commited on
Commit
969a06c
·
verified ·
1 Parent(s): 70bc29f

Training in progress, step 12000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e74cac81df1d9f55b850794a03cd64fce4492c0c0da5d81e9909dae9911f943
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f4e60280352c486adbf497b3e2d22d1d2fda6e133edf6aa2462b19ddeb1e8fe
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bf3be67603d9aa1f5d666b6a508c045b0cbd46af1138c22216863f18d284cfb
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c5caef45bff34542930b3d8ec1dc1da634abc684197bb4717e1fd4356a90f57
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:592f06f7337b836b66cd80a06e6dc9e25ae533b97c6347eb9344f6ecddefa9aa
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d62a6477ae00126d4db2168c55367d80e8a6869ee2c0b32115e2f67ad7b45e3
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a081bc5da5ed0dc09d1d00741d1fe6bdeae12f8d58e5b4d44a7d78e0ad120f04
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b778b133577e8a02dcdd3364fe347ed16d67e4165e95d771fc0e88a64c881d14
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0340327868852457,
6
  "eval_steps": 500,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1408,6 +1408,286 @@
1408
  "learning_rate": 0.0002946479666971158,
1409
  "loss": 1.8817,
1410
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1411
  }
1412
  ],
1413
  "logging_steps": 50,
@@ -1427,7 +1707,7 @@
1427
  "attributes": {}
1428
  }
1429
  },
1430
- "total_flos": 5.347720296331739e+18,
1431
  "train_batch_size": 16,
1432
  "trial_name": null,
1433
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0182622950819673,
6
  "eval_steps": 500,
7
+ "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1408
  "learning_rate": 0.0002946479666971158,
1409
  "loss": 1.8817,
1410
  "step": 10000
1411
+ },
1412
+ {
1413
+ "epoch": 2.0345792349726777,
1414
+ "grad_norm": 0.50390625,
1415
+ "learning_rate": 0.0002945777330412184,
1416
+ "loss": 1.9465,
1417
+ "step": 10050
1418
+ },
1419
+ {
1420
+ "epoch": 2.035125683060109,
1421
+ "grad_norm": 0.5703125,
1422
+ "learning_rate": 0.00029450705003439156,
1423
+ "loss": 1.9628,
1424
+ "step": 10100
1425
+ },
1426
+ {
1427
+ "epoch": 2.035672131147541,
1428
+ "grad_norm": 0.52734375,
1429
+ "learning_rate": 0.0002944359178963198,
1430
+ "loss": 1.9421,
1431
+ "step": 10150
1432
+ },
1433
+ {
1434
+ "epoch": 2.0362185792349727,
1435
+ "grad_norm": 0.51171875,
1436
+ "learning_rate": 0.00029436433684808336,
1437
+ "loss": 1.953,
1438
+ "step": 10200
1439
+ },
1440
+ {
1441
+ "epoch": 2.0367650273224043,
1442
+ "grad_norm": 0.5390625,
1443
+ "learning_rate": 0.0002942923071121578,
1444
+ "loss": 1.9499,
1445
+ "step": 10250
1446
+ },
1447
+ {
1448
+ "epoch": 2.0373114754098363,
1449
+ "grad_norm": 0.53515625,
1450
+ "learning_rate": 0.0002942198289124132,
1451
+ "loss": 1.9522,
1452
+ "step": 10300
1453
+ },
1454
+ {
1455
+ "epoch": 3.000229508196721,
1456
+ "grad_norm": 0.5078125,
1457
+ "learning_rate": 0.00029414690247411346,
1458
+ "loss": 1.9537,
1459
+ "step": 10350
1460
+ },
1461
+ {
1462
+ "epoch": 3.000775956284153,
1463
+ "grad_norm": 0.5390625,
1464
+ "learning_rate": 0.0002940735280239157,
1465
+ "loss": 1.9224,
1466
+ "step": 10400
1467
+ },
1468
+ {
1469
+ "epoch": 3.0013224043715847,
1470
+ "grad_norm": 0.5234375,
1471
+ "learning_rate": 0.0002939997057898693,
1472
+ "loss": 1.9165,
1473
+ "step": 10450
1474
+ },
1475
+ {
1476
+ "epoch": 3.0018688524590162,
1477
+ "grad_norm": 0.490234375,
1478
+ "learning_rate": 0.0002939254360014156,
1479
+ "loss": 1.8814,
1480
+ "step": 10500
1481
+ },
1482
+ {
1483
+ "epoch": 3.0024153005464482,
1484
+ "grad_norm": 0.52734375,
1485
+ "learning_rate": 0.0002938507188893867,
1486
+ "loss": 1.8547,
1487
+ "step": 10550
1488
+ },
1489
+ {
1490
+ "epoch": 3.0029617486338798,
1491
+ "grad_norm": 0.8125,
1492
+ "learning_rate": 0.00029377555468600516,
1493
+ "loss": 1.9014,
1494
+ "step": 10600
1495
+ },
1496
+ {
1497
+ "epoch": 3.0035081967213113,
1498
+ "grad_norm": 0.8125,
1499
+ "learning_rate": 0.00029369994362488306,
1500
+ "loss": 1.8837,
1501
+ "step": 10650
1502
+ },
1503
+ {
1504
+ "epoch": 3.0040546448087433,
1505
+ "grad_norm": 0.51953125,
1506
+ "learning_rate": 0.0002936238859410213,
1507
+ "loss": 1.9095,
1508
+ "step": 10700
1509
+ },
1510
+ {
1511
+ "epoch": 3.004601092896175,
1512
+ "grad_norm": 0.52734375,
1513
+ "learning_rate": 0.0002935473818708089,
1514
+ "loss": 1.8654,
1515
+ "step": 10750
1516
+ },
1517
+ {
1518
+ "epoch": 3.0051475409836064,
1519
+ "grad_norm": 0.58203125,
1520
+ "learning_rate": 0.00029347043165202233,
1521
+ "loss": 1.9018,
1522
+ "step": 10800
1523
+ },
1524
+ {
1525
+ "epoch": 3.0056939890710384,
1526
+ "grad_norm": 0.51171875,
1527
+ "learning_rate": 0.0002933930355238246,
1528
+ "loss": 1.895,
1529
+ "step": 10850
1530
+ },
1531
+ {
1532
+ "epoch": 3.00624043715847,
1533
+ "grad_norm": 0.50390625,
1534
+ "learning_rate": 0.0002933151937267647,
1535
+ "loss": 1.8872,
1536
+ "step": 10900
1537
+ },
1538
+ {
1539
+ "epoch": 3.0067868852459014,
1540
+ "grad_norm": 0.53125,
1541
+ "learning_rate": 0.0002932369065027767,
1542
+ "loss": 1.8532,
1543
+ "step": 10950
1544
+ },
1545
+ {
1546
+ "epoch": 3.0073333333333334,
1547
+ "grad_norm": 0.546875,
1548
+ "learning_rate": 0.0002931581740951791,
1549
+ "loss": 1.8935,
1550
+ "step": 11000
1551
+ },
1552
+ {
1553
+ "epoch": 3.007879781420765,
1554
+ "grad_norm": 0.5234375,
1555
+ "learning_rate": 0.00029307899674867405,
1556
+ "loss": 1.8991,
1557
+ "step": 11050
1558
+ },
1559
+ {
1560
+ "epoch": 3.008426229508197,
1561
+ "grad_norm": 0.53515625,
1562
+ "learning_rate": 0.00029299937470934656,
1563
+ "loss": 1.8784,
1564
+ "step": 11100
1565
+ },
1566
+ {
1567
+ "epoch": 3.0089726775956285,
1568
+ "grad_norm": 0.52734375,
1569
+ "learning_rate": 0.00029291930822466383,
1570
+ "loss": 1.8775,
1571
+ "step": 11150
1572
+ },
1573
+ {
1574
+ "epoch": 3.00951912568306,
1575
+ "grad_norm": 0.56640625,
1576
+ "learning_rate": 0.0002928387975434742,
1577
+ "loss": 1.8874,
1578
+ "step": 11200
1579
+ },
1580
+ {
1581
+ "epoch": 3.010065573770492,
1582
+ "grad_norm": 0.59765625,
1583
+ "learning_rate": 0.00029275784291600684,
1584
+ "loss": 1.9137,
1585
+ "step": 11250
1586
+ },
1587
+ {
1588
+ "epoch": 3.0106120218579235,
1589
+ "grad_norm": 0.53125,
1590
+ "learning_rate": 0.0002926764445938705,
1591
+ "loss": 1.8568,
1592
+ "step": 11300
1593
+ },
1594
+ {
1595
+ "epoch": 3.011158469945355,
1596
+ "grad_norm": 0.53125,
1597
+ "learning_rate": 0.0002925946028300532,
1598
+ "loss": 1.8578,
1599
+ "step": 11350
1600
+ },
1601
+ {
1602
+ "epoch": 3.011704918032787,
1603
+ "grad_norm": 0.53515625,
1604
+ "learning_rate": 0.0002925123178789209,
1605
+ "loss": 1.9092,
1606
+ "step": 11400
1607
+ },
1608
+ {
1609
+ "epoch": 3.0122513661202186,
1610
+ "grad_norm": 0.515625,
1611
+ "learning_rate": 0.00029242958999621717,
1612
+ "loss": 1.8663,
1613
+ "step": 11450
1614
+ },
1615
+ {
1616
+ "epoch": 3.01279781420765,
1617
+ "grad_norm": 0.54296875,
1618
+ "learning_rate": 0.00029234641943906223,
1619
+ "loss": 1.862,
1620
+ "step": 11500
1621
+ },
1622
+ {
1623
+ "epoch": 3.013344262295082,
1624
+ "grad_norm": 0.56640625,
1625
+ "learning_rate": 0.0002922628064659519,
1626
+ "loss": 1.8594,
1627
+ "step": 11550
1628
+ },
1629
+ {
1630
+ "epoch": 3.0138907103825137,
1631
+ "grad_norm": 0.546875,
1632
+ "learning_rate": 0.0002921787513367575,
1633
+ "loss": 1.8633,
1634
+ "step": 11600
1635
+ },
1636
+ {
1637
+ "epoch": 3.014437158469945,
1638
+ "grad_norm": 0.51953125,
1639
+ "learning_rate": 0.0002920942543127241,
1640
+ "loss": 1.8929,
1641
+ "step": 11650
1642
+ },
1643
+ {
1644
+ "epoch": 3.014983606557377,
1645
+ "grad_norm": 0.54296875,
1646
+ "learning_rate": 0.0002920093156564705,
1647
+ "loss": 1.8794,
1648
+ "step": 11700
1649
+ },
1650
+ {
1651
+ "epoch": 3.0155300546448087,
1652
+ "grad_norm": 0.5234375,
1653
+ "learning_rate": 0.0002919239356319879,
1654
+ "loss": 1.8691,
1655
+ "step": 11750
1656
+ },
1657
+ {
1658
+ "epoch": 3.0160765027322403,
1659
+ "grad_norm": 0.53515625,
1660
+ "learning_rate": 0.00029183811450463954,
1661
+ "loss": 1.8429,
1662
+ "step": 11800
1663
+ },
1664
+ {
1665
+ "epoch": 3.0166229508196722,
1666
+ "grad_norm": 0.5,
1667
+ "learning_rate": 0.00029175185254115934,
1668
+ "loss": 1.8325,
1669
+ "step": 11850
1670
+ },
1671
+ {
1672
+ "epoch": 3.017169398907104,
1673
+ "grad_norm": 0.53125,
1674
+ "learning_rate": 0.00029166515000965154,
1675
+ "loss": 1.8598,
1676
+ "step": 11900
1677
+ },
1678
+ {
1679
+ "epoch": 3.0177158469945353,
1680
+ "grad_norm": 0.55078125,
1681
+ "learning_rate": 0.0002915780071795896,
1682
+ "loss": 1.8376,
1683
+ "step": 11950
1684
+ },
1685
+ {
1686
+ "epoch": 3.0182622950819673,
1687
+ "grad_norm": 0.57421875,
1688
+ "learning_rate": 0.0002914904243218154,
1689
+ "loss": 1.8142,
1690
+ "step": 12000
1691
  }
1692
  ],
1693
  "logging_steps": 50,
 
1707
  "attributes": {}
1708
  }
1709
  },
1710
+ "total_flos": 6.417354593302217e+18,
1711
  "train_batch_size": 16,
1712
  "trial_name": null,
1713
  "trial_params": null