CocoRoF commited on
Commit
7da49a1
·
verified ·
1 Parent(s): 9ad1482

Training in progress, step 1500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:617ff76d496774425760760c864d36a829690e93c00f300e5fd6a772bec23af2
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f95abb4f35b22a7e758afd27aca2a69f419698ad1276e97da477eb487b8e37f1
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdd691842c9b3eecd0100adcd2271f6b8b162e162a40b960523591fe8491a784
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:048bbd1cabc0a45ad09ea3651c3b6e3d5d885845f3d358d903b243b17193ba3a
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c461c9d337dfc684e9352ec72bfa344e2f5d377f7cfc4475de9acae294dca89
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:079541ad505a3dc0e80239afe57cfe11e44b1de1d72b78fa14ede3d018356e24
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c784d1e14f175eecd1cb8a33bf7e3edbddb5399a3760000ee27c1d7309b565d
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2311754549460711,
5
  "eval_steps": 500,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1423,6 +1423,714 @@
1423
  "eval_samples_per_second": 609.845,
1424
  "eval_steps_per_second": 38.116,
1425
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1426
  }
1427
  ],
1428
  "logging_steps": 5,
@@ -1442,7 +2150,7 @@
1442
  "attributes": {}
1443
  }
1444
  },
1445
- "total_flos": 4.332357992788787e+18,
1446
  "train_batch_size": 4,
1447
  "trial_name": null,
1448
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.34676318241910664,
5
  "eval_steps": 500,
6
+ "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1423
  "eval_samples_per_second": 609.845,
1424
  "eval_steps_per_second": 38.116,
1425
  "step": 1000
1426
+ },
1427
+ {
1428
+ "epoch": 0.23233133222080146,
1429
+ "grad_norm": 166.75,
1430
+ "learning_rate": 8.530318602261049e-06,
1431
+ "loss": 72.1535,
1432
+ "step": 1005
1433
+ },
1434
+ {
1435
+ "epoch": 0.2334872094955318,
1436
+ "grad_norm": 178.5,
1437
+ "learning_rate": 8.517471736896197e-06,
1438
+ "loss": 74.0213,
1439
+ "step": 1010
1440
+ },
1441
+ {
1442
+ "epoch": 0.23464308677026216,
1443
+ "grad_norm": 156.25,
1444
+ "learning_rate": 8.504624871531348e-06,
1445
+ "loss": 74.9422,
1446
+ "step": 1015
1447
+ },
1448
+ {
1449
+ "epoch": 0.2357989640449925,
1450
+ "grad_norm": 174.0,
1451
+ "learning_rate": 8.491778006166495e-06,
1452
+ "loss": 72.5082,
1453
+ "step": 1020
1454
+ },
1455
+ {
1456
+ "epoch": 0.2369548413197229,
1457
+ "grad_norm": 162.5,
1458
+ "learning_rate": 8.478931140801646e-06,
1459
+ "loss": 73.302,
1460
+ "step": 1025
1461
+ },
1462
+ {
1463
+ "epoch": 0.23811071859445324,
1464
+ "grad_norm": 182.625,
1465
+ "learning_rate": 8.466084275436795e-06,
1466
+ "loss": 73.0291,
1467
+ "step": 1030
1468
+ },
1469
+ {
1470
+ "epoch": 0.23926659586918358,
1471
+ "grad_norm": 162.875,
1472
+ "learning_rate": 8.453237410071943e-06,
1473
+ "loss": 73.237,
1474
+ "step": 1035
1475
+ },
1476
+ {
1477
+ "epoch": 0.24042247314391393,
1478
+ "grad_norm": 181.75,
1479
+ "learning_rate": 8.440390544707092e-06,
1480
+ "loss": 73.0543,
1481
+ "step": 1040
1482
+ },
1483
+ {
1484
+ "epoch": 0.2415783504186443,
1485
+ "grad_norm": 194.25,
1486
+ "learning_rate": 8.427543679342241e-06,
1487
+ "loss": 72.1511,
1488
+ "step": 1045
1489
+ },
1490
+ {
1491
+ "epoch": 0.24273422769337466,
1492
+ "grad_norm": 197.625,
1493
+ "learning_rate": 8.41469681397739e-06,
1494
+ "loss": 72.7044,
1495
+ "step": 1050
1496
+ },
1497
+ {
1498
+ "epoch": 0.243890104968105,
1499
+ "grad_norm": 205.75,
1500
+ "learning_rate": 8.401849948612539e-06,
1501
+ "loss": 73.4772,
1502
+ "step": 1055
1503
+ },
1504
+ {
1505
+ "epoch": 0.24504598224283536,
1506
+ "grad_norm": 179.375,
1507
+ "learning_rate": 8.38900308324769e-06,
1508
+ "loss": 73.4765,
1509
+ "step": 1060
1510
+ },
1511
+ {
1512
+ "epoch": 0.24620185951756574,
1513
+ "grad_norm": 163.25,
1514
+ "learning_rate": 8.376156217882836e-06,
1515
+ "loss": 71.4076,
1516
+ "step": 1065
1517
+ },
1518
+ {
1519
+ "epoch": 0.24735773679229608,
1520
+ "grad_norm": 159.75,
1521
+ "learning_rate": 8.363309352517987e-06,
1522
+ "loss": 74.3268,
1523
+ "step": 1070
1524
+ },
1525
+ {
1526
+ "epoch": 0.24851361406702643,
1527
+ "grad_norm": 174.625,
1528
+ "learning_rate": 8.350462487153134e-06,
1529
+ "loss": 71.9133,
1530
+ "step": 1075
1531
+ },
1532
+ {
1533
+ "epoch": 0.24966949134175678,
1534
+ "grad_norm": 224.125,
1535
+ "learning_rate": 8.337615621788285e-06,
1536
+ "loss": 72.5681,
1537
+ "step": 1080
1538
+ },
1539
+ {
1540
+ "epoch": 0.25082536861648713,
1541
+ "grad_norm": 196.25,
1542
+ "learning_rate": 8.324768756423434e-06,
1543
+ "loss": 72.7169,
1544
+ "step": 1085
1545
+ },
1546
+ {
1547
+ "epoch": 0.2519812458912175,
1548
+ "grad_norm": 163.5,
1549
+ "learning_rate": 8.311921891058582e-06,
1550
+ "loss": 73.2148,
1551
+ "step": 1090
1552
+ },
1553
+ {
1554
+ "epoch": 0.25313712316594783,
1555
+ "grad_norm": 181.5,
1556
+ "learning_rate": 8.299075025693731e-06,
1557
+ "loss": 74.4619,
1558
+ "step": 1095
1559
+ },
1560
+ {
1561
+ "epoch": 0.2542930004406782,
1562
+ "grad_norm": 201.125,
1563
+ "learning_rate": 8.28622816032888e-06,
1564
+ "loss": 72.7498,
1565
+ "step": 1100
1566
+ },
1567
+ {
1568
+ "epoch": 0.2554488777154086,
1569
+ "grad_norm": 191.75,
1570
+ "learning_rate": 8.273381294964029e-06,
1571
+ "loss": 73.7849,
1572
+ "step": 1105
1573
+ },
1574
+ {
1575
+ "epoch": 0.2566047549901389,
1576
+ "grad_norm": 187.375,
1577
+ "learning_rate": 8.260534429599178e-06,
1578
+ "loss": 73.536,
1579
+ "step": 1110
1580
+ },
1581
+ {
1582
+ "epoch": 0.2577606322648693,
1583
+ "grad_norm": 161.25,
1584
+ "learning_rate": 8.247687564234328e-06,
1585
+ "loss": 73.2744,
1586
+ "step": 1115
1587
+ },
1588
+ {
1589
+ "epoch": 0.25891650953959966,
1590
+ "grad_norm": 170.875,
1591
+ "learning_rate": 8.234840698869475e-06,
1592
+ "loss": 74.2862,
1593
+ "step": 1120
1594
+ },
1595
+ {
1596
+ "epoch": 0.26007238681433,
1597
+ "grad_norm": 175.375,
1598
+ "learning_rate": 8.221993833504626e-06,
1599
+ "loss": 73.767,
1600
+ "step": 1125
1601
+ },
1602
+ {
1603
+ "epoch": 0.26122826408906036,
1604
+ "grad_norm": 175.25,
1605
+ "learning_rate": 8.209146968139775e-06,
1606
+ "loss": 72.3037,
1607
+ "step": 1130
1608
+ },
1609
+ {
1610
+ "epoch": 0.2623841413637907,
1611
+ "grad_norm": 180.125,
1612
+ "learning_rate": 8.196300102774924e-06,
1613
+ "loss": 73.3881,
1614
+ "step": 1135
1615
+ },
1616
+ {
1617
+ "epoch": 0.26354001863852106,
1618
+ "grad_norm": 165.25,
1619
+ "learning_rate": 8.183453237410073e-06,
1620
+ "loss": 74.2604,
1621
+ "step": 1140
1622
+ },
1623
+ {
1624
+ "epoch": 0.26469589591325143,
1625
+ "grad_norm": 208.875,
1626
+ "learning_rate": 8.170606372045221e-06,
1627
+ "loss": 73.6266,
1628
+ "step": 1145
1629
+ },
1630
+ {
1631
+ "epoch": 0.26585177318798175,
1632
+ "grad_norm": 164.375,
1633
+ "learning_rate": 8.15775950668037e-06,
1634
+ "loss": 71.7306,
1635
+ "step": 1150
1636
+ },
1637
+ {
1638
+ "epoch": 0.26700765046271213,
1639
+ "grad_norm": 180.5,
1640
+ "learning_rate": 8.14491264131552e-06,
1641
+ "loss": 73.3496,
1642
+ "step": 1155
1643
+ },
1644
+ {
1645
+ "epoch": 0.26816352773744245,
1646
+ "grad_norm": 161.75,
1647
+ "learning_rate": 8.132065775950668e-06,
1648
+ "loss": 71.9542,
1649
+ "step": 1160
1650
+ },
1651
+ {
1652
+ "epoch": 0.26931940501217283,
1653
+ "grad_norm": 160.375,
1654
+ "learning_rate": 8.119218910585818e-06,
1655
+ "loss": 72.852,
1656
+ "step": 1165
1657
+ },
1658
+ {
1659
+ "epoch": 0.2704752822869032,
1660
+ "grad_norm": 166.875,
1661
+ "learning_rate": 8.106372045220967e-06,
1662
+ "loss": 73.498,
1663
+ "step": 1170
1664
+ },
1665
+ {
1666
+ "epoch": 0.27163115956163353,
1667
+ "grad_norm": 152.5,
1668
+ "learning_rate": 8.093525179856116e-06,
1669
+ "loss": 72.89,
1670
+ "step": 1175
1671
+ },
1672
+ {
1673
+ "epoch": 0.2727870368363639,
1674
+ "grad_norm": 179.625,
1675
+ "learning_rate": 8.080678314491265e-06,
1676
+ "loss": 73.7572,
1677
+ "step": 1180
1678
+ },
1679
+ {
1680
+ "epoch": 0.2739429141110943,
1681
+ "grad_norm": 178.625,
1682
+ "learning_rate": 8.067831449126414e-06,
1683
+ "loss": 72.1453,
1684
+ "step": 1185
1685
+ },
1686
+ {
1687
+ "epoch": 0.2750987913858246,
1688
+ "grad_norm": 165.0,
1689
+ "learning_rate": 8.054984583761563e-06,
1690
+ "loss": 73.9004,
1691
+ "step": 1190
1692
+ },
1693
+ {
1694
+ "epoch": 0.276254668660555,
1695
+ "grad_norm": 170.875,
1696
+ "learning_rate": 8.042137718396712e-06,
1697
+ "loss": 72.7976,
1698
+ "step": 1195
1699
+ },
1700
+ {
1701
+ "epoch": 0.2774105459352853,
1702
+ "grad_norm": 163.125,
1703
+ "learning_rate": 8.029290853031862e-06,
1704
+ "loss": 73.1718,
1705
+ "step": 1200
1706
+ },
1707
+ {
1708
+ "epoch": 0.2785664232100157,
1709
+ "grad_norm": 179.25,
1710
+ "learning_rate": 8.01644398766701e-06,
1711
+ "loss": 72.7056,
1712
+ "step": 1205
1713
+ },
1714
+ {
1715
+ "epoch": 0.27972230048474606,
1716
+ "grad_norm": 190.625,
1717
+ "learning_rate": 8.00359712230216e-06,
1718
+ "loss": 73.2504,
1719
+ "step": 1210
1720
+ },
1721
+ {
1722
+ "epoch": 0.2808781777594764,
1723
+ "grad_norm": 171.375,
1724
+ "learning_rate": 7.990750256937307e-06,
1725
+ "loss": 72.886,
1726
+ "step": 1215
1727
+ },
1728
+ {
1729
+ "epoch": 0.28203405503420675,
1730
+ "grad_norm": 168.25,
1731
+ "learning_rate": 7.977903391572457e-06,
1732
+ "loss": 73.0636,
1733
+ "step": 1220
1734
+ },
1735
+ {
1736
+ "epoch": 0.2831899323089371,
1737
+ "grad_norm": 170.875,
1738
+ "learning_rate": 7.965056526207606e-06,
1739
+ "loss": 73.4683,
1740
+ "step": 1225
1741
+ },
1742
+ {
1743
+ "epoch": 0.28434580958366745,
1744
+ "grad_norm": 170.375,
1745
+ "learning_rate": 7.952209660842755e-06,
1746
+ "loss": 72.7721,
1747
+ "step": 1230
1748
+ },
1749
+ {
1750
+ "epoch": 0.28550168685839783,
1751
+ "grad_norm": 187.875,
1752
+ "learning_rate": 7.939362795477904e-06,
1753
+ "loss": 74.0967,
1754
+ "step": 1235
1755
+ },
1756
+ {
1757
+ "epoch": 0.28665756413312815,
1758
+ "grad_norm": 206.0,
1759
+ "learning_rate": 7.926515930113053e-06,
1760
+ "loss": 72.2294,
1761
+ "step": 1240
1762
+ },
1763
+ {
1764
+ "epoch": 0.28781344140785853,
1765
+ "grad_norm": 174.625,
1766
+ "learning_rate": 7.913669064748202e-06,
1767
+ "loss": 71.7166,
1768
+ "step": 1245
1769
+ },
1770
+ {
1771
+ "epoch": 0.28896931868258885,
1772
+ "grad_norm": 195.0,
1773
+ "learning_rate": 7.90082219938335e-06,
1774
+ "loss": 72.4936,
1775
+ "step": 1250
1776
+ },
1777
+ {
1778
+ "epoch": 0.2901251959573192,
1779
+ "grad_norm": 174.25,
1780
+ "learning_rate": 7.887975334018501e-06,
1781
+ "loss": 71.8098,
1782
+ "step": 1255
1783
+ },
1784
+ {
1785
+ "epoch": 0.2912810732320496,
1786
+ "grad_norm": 162.25,
1787
+ "learning_rate": 7.875128468653648e-06,
1788
+ "loss": 73.6741,
1789
+ "step": 1260
1790
+ },
1791
+ {
1792
+ "epoch": 0.2924369505067799,
1793
+ "grad_norm": 163.0,
1794
+ "learning_rate": 7.862281603288799e-06,
1795
+ "loss": 74.2419,
1796
+ "step": 1265
1797
+ },
1798
+ {
1799
+ "epoch": 0.2935928277815103,
1800
+ "grad_norm": 156.625,
1801
+ "learning_rate": 7.849434737923948e-06,
1802
+ "loss": 71.8441,
1803
+ "step": 1270
1804
+ },
1805
+ {
1806
+ "epoch": 0.2947487050562407,
1807
+ "grad_norm": 159.125,
1808
+ "learning_rate": 7.836587872559096e-06,
1809
+ "loss": 72.4638,
1810
+ "step": 1275
1811
+ },
1812
+ {
1813
+ "epoch": 0.295904582330971,
1814
+ "grad_norm": 163.75,
1815
+ "learning_rate": 7.823741007194245e-06,
1816
+ "loss": 72.4716,
1817
+ "step": 1280
1818
+ },
1819
+ {
1820
+ "epoch": 0.2970604596057014,
1821
+ "grad_norm": 149.25,
1822
+ "learning_rate": 7.810894141829394e-06,
1823
+ "loss": 71.0848,
1824
+ "step": 1285
1825
+ },
1826
+ {
1827
+ "epoch": 0.2982163368804317,
1828
+ "grad_norm": 157.625,
1829
+ "learning_rate": 7.798047276464543e-06,
1830
+ "loss": 71.6886,
1831
+ "step": 1290
1832
+ },
1833
+ {
1834
+ "epoch": 0.2993722141551621,
1835
+ "grad_norm": 161.125,
1836
+ "learning_rate": 7.785200411099692e-06,
1837
+ "loss": 72.6646,
1838
+ "step": 1295
1839
+ },
1840
+ {
1841
+ "epoch": 0.30052809142989245,
1842
+ "grad_norm": 189.25,
1843
+ "learning_rate": 7.77235354573484e-06,
1844
+ "loss": 72.4807,
1845
+ "step": 1300
1846
+ },
1847
+ {
1848
+ "epoch": 0.3016839687046228,
1849
+ "grad_norm": 207.0,
1850
+ "learning_rate": 7.75950668036999e-06,
1851
+ "loss": 72.7943,
1852
+ "step": 1305
1853
+ },
1854
+ {
1855
+ "epoch": 0.30283984597935315,
1856
+ "grad_norm": 160.625,
1857
+ "learning_rate": 7.74665981500514e-06,
1858
+ "loss": 72.9901,
1859
+ "step": 1310
1860
+ },
1861
+ {
1862
+ "epoch": 0.30399572325408347,
1863
+ "grad_norm": 218.375,
1864
+ "learning_rate": 7.733812949640287e-06,
1865
+ "loss": 73.4908,
1866
+ "step": 1315
1867
+ },
1868
+ {
1869
+ "epoch": 0.30515160052881385,
1870
+ "grad_norm": 166.125,
1871
+ "learning_rate": 7.720966084275438e-06,
1872
+ "loss": 72.9456,
1873
+ "step": 1320
1874
+ },
1875
+ {
1876
+ "epoch": 0.3063074778035442,
1877
+ "grad_norm": 171.0,
1878
+ "learning_rate": 7.708119218910587e-06,
1879
+ "loss": 72.6809,
1880
+ "step": 1325
1881
+ },
1882
+ {
1883
+ "epoch": 0.30746335507827455,
1884
+ "grad_norm": 162.75,
1885
+ "learning_rate": 7.695272353545736e-06,
1886
+ "loss": 71.7408,
1887
+ "step": 1330
1888
+ },
1889
+ {
1890
+ "epoch": 0.3086192323530049,
1891
+ "grad_norm": 155.875,
1892
+ "learning_rate": 7.682425488180884e-06,
1893
+ "loss": 73.5375,
1894
+ "step": 1335
1895
+ },
1896
+ {
1897
+ "epoch": 0.3097751096277353,
1898
+ "grad_norm": 155.625,
1899
+ "learning_rate": 7.669578622816033e-06,
1900
+ "loss": 72.8354,
1901
+ "step": 1340
1902
+ },
1903
+ {
1904
+ "epoch": 0.3109309869024656,
1905
+ "grad_norm": 176.75,
1906
+ "learning_rate": 7.656731757451182e-06,
1907
+ "loss": 72.5769,
1908
+ "step": 1345
1909
+ },
1910
+ {
1911
+ "epoch": 0.312086864177196,
1912
+ "grad_norm": 187.75,
1913
+ "learning_rate": 7.643884892086333e-06,
1914
+ "loss": 72.8171,
1915
+ "step": 1350
1916
+ },
1917
+ {
1918
+ "epoch": 0.3132427414519263,
1919
+ "grad_norm": 159.25,
1920
+ "learning_rate": 7.63103802672148e-06,
1921
+ "loss": 72.2052,
1922
+ "step": 1355
1923
+ },
1924
+ {
1925
+ "epoch": 0.3143986187266567,
1926
+ "grad_norm": 163.125,
1927
+ "learning_rate": 7.6181911613566294e-06,
1928
+ "loss": 71.9153,
1929
+ "step": 1360
1930
+ },
1931
+ {
1932
+ "epoch": 0.3155544960013871,
1933
+ "grad_norm": 171.375,
1934
+ "learning_rate": 7.605344295991779e-06,
1935
+ "loss": 70.9257,
1936
+ "step": 1365
1937
+ },
1938
+ {
1939
+ "epoch": 0.3167103732761174,
1940
+ "grad_norm": 180.0,
1941
+ "learning_rate": 7.592497430626927e-06,
1942
+ "loss": 72.8817,
1943
+ "step": 1370
1944
+ },
1945
+ {
1946
+ "epoch": 0.3178662505508478,
1947
+ "grad_norm": 173.625,
1948
+ "learning_rate": 7.579650565262077e-06,
1949
+ "loss": 72.5966,
1950
+ "step": 1375
1951
+ },
1952
+ {
1953
+ "epoch": 0.3190221278255781,
1954
+ "grad_norm": 160.125,
1955
+ "learning_rate": 7.566803699897226e-06,
1956
+ "loss": 72.0935,
1957
+ "step": 1380
1958
+ },
1959
+ {
1960
+ "epoch": 0.32017800510030847,
1961
+ "grad_norm": 158.875,
1962
+ "learning_rate": 7.5539568345323745e-06,
1963
+ "loss": 72.1024,
1964
+ "step": 1385
1965
+ },
1966
+ {
1967
+ "epoch": 0.32133388237503885,
1968
+ "grad_norm": 155.375,
1969
+ "learning_rate": 7.541109969167524e-06,
1970
+ "loss": 71.8397,
1971
+ "step": 1390
1972
+ },
1973
+ {
1974
+ "epoch": 0.32248975964976917,
1975
+ "grad_norm": 165.5,
1976
+ "learning_rate": 7.528263103802673e-06,
1977
+ "loss": 71.5119,
1978
+ "step": 1395
1979
+ },
1980
+ {
1981
+ "epoch": 0.32364563692449955,
1982
+ "grad_norm": 170.875,
1983
+ "learning_rate": 7.515416238437822e-06,
1984
+ "loss": 73.3594,
1985
+ "step": 1400
1986
+ },
1987
+ {
1988
+ "epoch": 0.3248015141992299,
1989
+ "grad_norm": 184.125,
1990
+ "learning_rate": 7.502569373072971e-06,
1991
+ "loss": 73.504,
1992
+ "step": 1405
1993
+ },
1994
+ {
1995
+ "epoch": 0.32595739147396025,
1996
+ "grad_norm": 165.25,
1997
+ "learning_rate": 7.4897225077081205e-06,
1998
+ "loss": 71.1564,
1999
+ "step": 1410
2000
+ },
2001
+ {
2002
+ "epoch": 0.3271132687486906,
2003
+ "grad_norm": 175.5,
2004
+ "learning_rate": 7.4768756423432685e-06,
2005
+ "loss": 72.8789,
2006
+ "step": 1415
2007
+ },
2008
+ {
2009
+ "epoch": 0.32826914602342094,
2010
+ "grad_norm": 168.375,
2011
+ "learning_rate": 7.464028776978418e-06,
2012
+ "loss": 73.2979,
2013
+ "step": 1420
2014
+ },
2015
+ {
2016
+ "epoch": 0.3294250232981513,
2017
+ "grad_norm": 177.5,
2018
+ "learning_rate": 7.451181911613566e-06,
2019
+ "loss": 71.187,
2020
+ "step": 1425
2021
+ },
2022
+ {
2023
+ "epoch": 0.3305809005728817,
2024
+ "grad_norm": 160.375,
2025
+ "learning_rate": 7.438335046248716e-06,
2026
+ "loss": 73.9314,
2027
+ "step": 1430
2028
+ },
2029
+ {
2030
+ "epoch": 0.331736777847612,
2031
+ "grad_norm": 167.625,
2032
+ "learning_rate": 7.4254881808838655e-06,
2033
+ "loss": 72.4073,
2034
+ "step": 1435
2035
+ },
2036
+ {
2037
+ "epoch": 0.3328926551223424,
2038
+ "grad_norm": 167.0,
2039
+ "learning_rate": 7.4126413155190135e-06,
2040
+ "loss": 72.5102,
2041
+ "step": 1440
2042
+ },
2043
+ {
2044
+ "epoch": 0.3340485323970727,
2045
+ "grad_norm": 173.5,
2046
+ "learning_rate": 7.399794450154163e-06,
2047
+ "loss": 72.4247,
2048
+ "step": 1445
2049
+ },
2050
+ {
2051
+ "epoch": 0.3352044096718031,
2052
+ "grad_norm": 169.125,
2053
+ "learning_rate": 7.386947584789312e-06,
2054
+ "loss": 71.9371,
2055
+ "step": 1450
2056
+ },
2057
+ {
2058
+ "epoch": 0.33636028694653347,
2059
+ "grad_norm": 157.375,
2060
+ "learning_rate": 7.374100719424461e-06,
2061
+ "loss": 71.7102,
2062
+ "step": 1455
2063
+ },
2064
+ {
2065
+ "epoch": 0.3375161642212638,
2066
+ "grad_norm": 173.625,
2067
+ "learning_rate": 7.36125385405961e-06,
2068
+ "loss": 72.5739,
2069
+ "step": 1460
2070
+ },
2071
+ {
2072
+ "epoch": 0.33867204149599417,
2073
+ "grad_norm": 196.125,
2074
+ "learning_rate": 7.3484069886947595e-06,
2075
+ "loss": 72.1263,
2076
+ "step": 1465
2077
+ },
2078
+ {
2079
+ "epoch": 0.3398279187707245,
2080
+ "grad_norm": 154.125,
2081
+ "learning_rate": 7.3355601233299075e-06,
2082
+ "loss": 72.4053,
2083
+ "step": 1470
2084
+ },
2085
+ {
2086
+ "epoch": 0.34098379604545487,
2087
+ "grad_norm": 182.125,
2088
+ "learning_rate": 7.322713257965057e-06,
2089
+ "loss": 72.6574,
2090
+ "step": 1475
2091
+ },
2092
+ {
2093
+ "epoch": 0.34213967332018524,
2094
+ "grad_norm": 182.375,
2095
+ "learning_rate": 7.309866392600207e-06,
2096
+ "loss": 72.0569,
2097
+ "step": 1480
2098
+ },
2099
+ {
2100
+ "epoch": 0.34329555059491557,
2101
+ "grad_norm": 161.375,
2102
+ "learning_rate": 7.297019527235355e-06,
2103
+ "loss": 72.4482,
2104
+ "step": 1485
2105
+ },
2106
+ {
2107
+ "epoch": 0.34445142786964594,
2108
+ "grad_norm": 165.25,
2109
+ "learning_rate": 7.2841726618705045e-06,
2110
+ "loss": 73.3294,
2111
+ "step": 1490
2112
+ },
2113
+ {
2114
+ "epoch": 0.3456073051443763,
2115
+ "grad_norm": 162.625,
2116
+ "learning_rate": 7.2713257965056525e-06,
2117
+ "loss": 72.414,
2118
+ "step": 1495
2119
+ },
2120
+ {
2121
+ "epoch": 0.34676318241910664,
2122
+ "grad_norm": 165.875,
2123
+ "learning_rate": 7.258478931140802e-06,
2124
+ "loss": 70.1396,
2125
+ "step": 1500
2126
+ },
2127
+ {
2128
+ "epoch": 0.34676318241910664,
2129
+ "eval_loss": NaN,
2130
+ "eval_runtime": 382.2578,
2131
+ "eval_samples_per_second": 609.887,
2132
+ "eval_steps_per_second": 38.118,
2133
+ "step": 1500
2134
  }
2135
  ],
2136
  "logging_steps": 5,
 
2150
  "attributes": {}
2151
  }
2152
  },
2153
+ "total_flos": 6.498536989183181e+18,
2154
  "train_batch_size": 4,
2155
  "trial_name": null,
2156
  "trial_params": null