NBAmine commited on
Commit
fc289ba
·
verified ·
1 Parent(s): af4df02

Training in progress, epoch 4, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -29,12 +29,12 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "down_proj",
33
- "q_proj",
34
  "o_proj",
35
  "k_proj",
36
- "up_proj",
37
  "v_proj",
 
38
  "gate_proj"
39
  ],
40
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "up_proj",
33
  "down_proj",
 
34
  "o_proj",
35
  "k_proj",
 
36
  "v_proj",
37
+ "q_proj",
38
  "gate_proj"
39
  ],
40
  "target_parameters": null,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42057458be0849df210a2b4c2241429197465f786f00b0c91791a8239fe63ce0
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2fb6a48527f0c2bf807f44a8a6c8d4802eed8c0ba13fcee79401bb0186b7abe
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:486a933a9db49920bfa89b88f3df33a30e37dd2e0d00f86eab85749749cfb1cd
3
  size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09180e4c7321493fa69e3325d0daca10f06fcfe93360c6ea07fc029705406cd2
3
  size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9624fb715f3fe663fa916439122fcd0c3a8e903cf9047d070921678e351f1695
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c42920b30e520a675979a95992aac8814b12873171610ee1340a8766fb972bdc
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bc94b43794521e81946badd820ca495ec5676bcf0035e98e623d3832e5330ab
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e52e5e1cab4eb2aaf089a2bd08f96b02fc9c9dc390e79e6248ae6cd8a7e48f8
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57dbcaa4c36dfe8b1884cd38afdda1f50d97d5b0660c412d604e987f28a13d71
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a8c29a421e8a95f2d6d46e5ac0aa25be56966681afe38a5d47f15222c56ec6b
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.0,
6
  "eval_steps": 500,
7
- "global_step": 1314,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1354,6 +1354,458 @@
1354
  "eval_samples_per_second": 1.318,
1355
  "eval_steps_per_second": 0.33,
1356
  "step": 1314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1357
  }
1358
  ],
1359
  "logging_steps": 10,
@@ -1373,7 +1825,7 @@
1373
  "attributes": {}
1374
  }
1375
  },
1376
- "total_flos": 5.688939372905779e+17,
1377
  "train_batch_size": 1,
1378
  "trial_name": null,
1379
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
  "eval_steps": 500,
7
+ "global_step": 1752,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1354
  "eval_samples_per_second": 1.318,
1355
  "eval_steps_per_second": 0.33,
1356
  "step": 1314
1357
+ },
1358
+ {
1359
+ "entropy": 0.05517864182669049,
1360
+ "epoch": 3.013714285714286,
1361
+ "grad_norm": 0.515275776386261,
1362
+ "learning_rate": 3.9863013698630135e-06,
1363
+ "loss": 0.0487,
1364
+ "mean_token_accuracy": 0.9875749964267015,
1365
+ "num_tokens": 106699.0,
1366
+ "step": 1320
1367
+ },
1368
+ {
1369
+ "entropy": 0.08493794328533114,
1370
+ "epoch": 3.0365714285714285,
1371
+ "grad_norm": 0.3127270042896271,
1372
+ "learning_rate": 3.940639269406393e-06,
1373
+ "loss": 0.077,
1374
+ "mean_token_accuracy": 0.9797436378896236,
1375
+ "num_tokens": 199088.0,
1376
+ "step": 1330
1377
+ },
1378
+ {
1379
+ "entropy": 0.12270754701457917,
1380
+ "epoch": 3.0594285714285716,
1381
+ "grad_norm": 0.42502084374427795,
1382
+ "learning_rate": 3.8949771689497714e-06,
1383
+ "loss": 0.1128,
1384
+ "mean_token_accuracy": 0.9702239688485861,
1385
+ "num_tokens": 252321.0,
1386
+ "step": 1340
1387
+ },
1388
+ {
1389
+ "entropy": 0.15497879879549145,
1390
+ "epoch": 3.0822857142857143,
1391
+ "grad_norm": 0.4974645674228668,
1392
+ "learning_rate": 3.849315068493151e-06,
1393
+ "loss": 0.1455,
1394
+ "mean_token_accuracy": 0.9595650866627693,
1395
+ "num_tokens": 287508.0,
1396
+ "step": 1350
1397
+ },
1398
+ {
1399
+ "entropy": 0.1917463649995625,
1400
+ "epoch": 3.105142857142857,
1401
+ "grad_norm": 0.6485738754272461,
1402
+ "learning_rate": 3.8036529680365297e-06,
1403
+ "loss": 0.1756,
1404
+ "mean_token_accuracy": 0.9525706138461828,
1405
+ "num_tokens": 312832.0,
1406
+ "step": 1360
1407
+ },
1408
+ {
1409
+ "entropy": 0.12080624285154044,
1410
+ "epoch": 3.128,
1411
+ "grad_norm": 0.23594826459884644,
1412
+ "learning_rate": 3.7579908675799087e-06,
1413
+ "loss": 0.1067,
1414
+ "mean_token_accuracy": 0.9709519907832146,
1415
+ "num_tokens": 413268.0,
1416
+ "step": 1370
1417
+ },
1418
+ {
1419
+ "entropy": 0.08804343957453967,
1420
+ "epoch": 3.150857142857143,
1421
+ "grad_norm": 0.2645934522151947,
1422
+ "learning_rate": 3.7123287671232876e-06,
1423
+ "loss": 0.0805,
1424
+ "mean_token_accuracy": 0.9789596509188414,
1425
+ "num_tokens": 497620.0,
1426
+ "step": 1380
1427
+ },
1428
+ {
1429
+ "entropy": 0.11698034470900893,
1430
+ "epoch": 3.1737142857142855,
1431
+ "grad_norm": 0.40339401364326477,
1432
+ "learning_rate": 3.6666666666666666e-06,
1433
+ "loss": 0.1083,
1434
+ "mean_token_accuracy": 0.9707631807774305,
1435
+ "num_tokens": 551786.0,
1436
+ "step": 1390
1437
+ },
1438
+ {
1439
+ "entropy": 0.15082112224772573,
1440
+ "epoch": 3.1965714285714286,
1441
+ "grad_norm": 0.45658695697784424,
1442
+ "learning_rate": 3.6210045662100455e-06,
1443
+ "loss": 0.1421,
1444
+ "mean_token_accuracy": 0.9619044814258814,
1445
+ "num_tokens": 589055.0,
1446
+ "step": 1400
1447
+ },
1448
+ {
1449
+ "entropy": 0.18135957256890833,
1450
+ "epoch": 3.2194285714285713,
1451
+ "grad_norm": 0.6109932661056519,
1452
+ "learning_rate": 3.575342465753425e-06,
1453
+ "loss": 0.1674,
1454
+ "mean_token_accuracy": 0.9547197036445141,
1455
+ "num_tokens": 616792.0,
1456
+ "step": 1410
1457
+ },
1458
+ {
1459
+ "entropy": 0.1157609753194265,
1460
+ "epoch": 3.2422857142857144,
1461
+ "grad_norm": 0.2772703766822815,
1462
+ "learning_rate": 3.529680365296804e-06,
1463
+ "loss": 0.1024,
1464
+ "mean_token_accuracy": 0.9713719986379147,
1465
+ "num_tokens": 725086.0,
1466
+ "step": 1420
1467
+ },
1468
+ {
1469
+ "entropy": 0.08992256266064942,
1470
+ "epoch": 3.265142857142857,
1471
+ "grad_norm": 0.35438141226768494,
1472
+ "learning_rate": 3.4840182648401828e-06,
1473
+ "loss": 0.0849,
1474
+ "mean_token_accuracy": 0.9776469606906175,
1475
+ "num_tokens": 810329.0,
1476
+ "step": 1430
1477
+ },
1478
+ {
1479
+ "entropy": 0.12669551267754287,
1480
+ "epoch": 3.288,
1481
+ "grad_norm": 0.5349701046943665,
1482
+ "learning_rate": 3.4383561643835617e-06,
1483
+ "loss": 0.12,
1484
+ "mean_token_accuracy": 0.9680649910122157,
1485
+ "num_tokens": 862174.0,
1486
+ "step": 1440
1487
+ },
1488
+ {
1489
+ "entropy": 0.15726197781041265,
1490
+ "epoch": 3.310857142857143,
1491
+ "grad_norm": 0.554969847202301,
1492
+ "learning_rate": 3.3926940639269407e-06,
1493
+ "loss": 0.1454,
1494
+ "mean_token_accuracy": 0.9603239048272372,
1495
+ "num_tokens": 898631.0,
1496
+ "step": 1450
1497
+ },
1498
+ {
1499
+ "entropy": 0.18559287013486028,
1500
+ "epoch": 3.3337142857142856,
1501
+ "grad_norm": 0.6043559312820435,
1502
+ "learning_rate": 3.3470319634703196e-06,
1503
+ "loss": 0.1712,
1504
+ "mean_token_accuracy": 0.9533442974090576,
1505
+ "num_tokens": 925437.0,
1506
+ "step": 1460
1507
+ },
1508
+ {
1509
+ "entropy": 0.11834120823768898,
1510
+ "epoch": 3.3565714285714288,
1511
+ "grad_norm": 0.21197669208049774,
1512
+ "learning_rate": 3.3013698630136985e-06,
1513
+ "loss": 0.1045,
1514
+ "mean_token_accuracy": 0.9706813614815474,
1515
+ "num_tokens": 1023847.0,
1516
+ "step": 1470
1517
+ },
1518
+ {
1519
+ "entropy": 0.08211489983368665,
1520
+ "epoch": 3.3794285714285714,
1521
+ "grad_norm": 0.3356609344482422,
1522
+ "learning_rate": 3.2557077625570775e-06,
1523
+ "loss": 0.0779,
1524
+ "mean_token_accuracy": 0.9798738922923803,
1525
+ "num_tokens": 1110782.0,
1526
+ "step": 1480
1527
+ },
1528
+ {
1529
+ "entropy": 0.1178143423749134,
1530
+ "epoch": 3.402285714285714,
1531
+ "grad_norm": 0.4783886671066284,
1532
+ "learning_rate": 3.210045662100457e-06,
1533
+ "loss": 0.1093,
1534
+ "mean_token_accuracy": 0.970770888775587,
1535
+ "num_tokens": 1165952.0,
1536
+ "step": 1490
1537
+ },
1538
+ {
1539
+ "entropy": 0.15600966215133666,
1540
+ "epoch": 3.4251428571428573,
1541
+ "grad_norm": 0.5326858758926392,
1542
+ "learning_rate": 3.164383561643836e-06,
1543
+ "loss": 0.1416,
1544
+ "mean_token_accuracy": 0.9619269706308842,
1545
+ "num_tokens": 1203442.0,
1546
+ "step": 1500
1547
+ },
1548
+ {
1549
+ "entropy": 0.18143940633162856,
1550
+ "epoch": 3.448,
1551
+ "grad_norm": 0.6271806955337524,
1552
+ "learning_rate": 3.1187214611872147e-06,
1553
+ "loss": 0.1692,
1554
+ "mean_token_accuracy": 0.9546364113688469,
1555
+ "num_tokens": 1231043.0,
1556
+ "step": 1510
1557
+ },
1558
+ {
1559
+ "entropy": 0.12511544737499208,
1560
+ "epoch": 3.4708571428571426,
1561
+ "grad_norm": 0.22346270084381104,
1562
+ "learning_rate": 3.0730593607305937e-06,
1563
+ "loss": 0.1142,
1564
+ "mean_token_accuracy": 0.9701676283031702,
1565
+ "num_tokens": 1337013.0,
1566
+ "step": 1520
1567
+ },
1568
+ {
1569
+ "entropy": 0.08854632088914513,
1570
+ "epoch": 3.4937142857142858,
1571
+ "grad_norm": 0.3023281693458557,
1572
+ "learning_rate": 3.0273972602739726e-06,
1573
+ "loss": 0.0809,
1574
+ "mean_token_accuracy": 0.9785845920443534,
1575
+ "num_tokens": 1425282.0,
1576
+ "step": 1530
1577
+ },
1578
+ {
1579
+ "entropy": 0.12356827890034765,
1580
+ "epoch": 3.5165714285714285,
1581
+ "grad_norm": 0.43480384349823,
1582
+ "learning_rate": 2.9817351598173516e-06,
1583
+ "loss": 0.1108,
1584
+ "mean_token_accuracy": 0.9708164893090725,
1585
+ "num_tokens": 1479071.0,
1586
+ "step": 1540
1587
+ },
1588
+ {
1589
+ "entropy": 0.16047979763243347,
1590
+ "epoch": 3.5394285714285716,
1591
+ "grad_norm": 0.4700470566749573,
1592
+ "learning_rate": 2.9360730593607305e-06,
1593
+ "loss": 0.1487,
1594
+ "mean_token_accuracy": 0.9594798684120178,
1595
+ "num_tokens": 1515968.0,
1596
+ "step": 1550
1597
+ },
1598
+ {
1599
+ "entropy": 0.17729573035612703,
1600
+ "epoch": 3.5622857142857143,
1601
+ "grad_norm": 0.7114794850349426,
1602
+ "learning_rate": 2.8904109589041095e-06,
1603
+ "loss": 0.1656,
1604
+ "mean_token_accuracy": 0.9552927497774363,
1605
+ "num_tokens": 1543101.0,
1606
+ "step": 1560
1607
+ },
1608
+ {
1609
+ "entropy": 0.11861445235554129,
1610
+ "epoch": 3.5851428571428574,
1611
+ "grad_norm": 0.29876643419265747,
1612
+ "learning_rate": 2.8447488584474884e-06,
1613
+ "loss": 0.1053,
1614
+ "mean_token_accuracy": 0.9716573052108288,
1615
+ "num_tokens": 1647093.0,
1616
+ "step": 1570
1617
+ },
1618
+ {
1619
+ "entropy": 0.08495850274339319,
1620
+ "epoch": 3.608,
1621
+ "grad_norm": 0.34683695435523987,
1622
+ "learning_rate": 2.7990867579908678e-06,
1623
+ "loss": 0.0789,
1624
+ "mean_token_accuracy": 0.9797851830720902,
1625
+ "num_tokens": 1730702.0,
1626
+ "step": 1580
1627
+ },
1628
+ {
1629
+ "entropy": 0.12895205311942845,
1630
+ "epoch": 3.630857142857143,
1631
+ "grad_norm": 0.4918679893016815,
1632
+ "learning_rate": 2.7534246575342467e-06,
1633
+ "loss": 0.1215,
1634
+ "mean_token_accuracy": 0.9667810715734959,
1635
+ "num_tokens": 1781823.0,
1636
+ "step": 1590
1637
+ },
1638
+ {
1639
+ "entropy": 0.15560776912607252,
1640
+ "epoch": 3.653714285714286,
1641
+ "grad_norm": 0.5862769484519958,
1642
+ "learning_rate": 2.7077625570776257e-06,
1643
+ "loss": 0.1455,
1644
+ "mean_token_accuracy": 0.9603867087513208,
1645
+ "num_tokens": 1817890.0,
1646
+ "step": 1600
1647
+ },
1648
+ {
1649
+ "entropy": 0.1749238725285977,
1650
+ "epoch": 3.6765714285714286,
1651
+ "grad_norm": 0.8136036992073059,
1652
+ "learning_rate": 2.6621004566210046e-06,
1653
+ "loss": 0.1554,
1654
+ "mean_token_accuracy": 0.9576821938157082,
1655
+ "num_tokens": 1844158.0,
1656
+ "step": 1610
1657
+ },
1658
+ {
1659
+ "entropy": 0.11242687762714923,
1660
+ "epoch": 3.6994285714285713,
1661
+ "grad_norm": 0.2995806634426117,
1662
+ "learning_rate": 2.6164383561643835e-06,
1663
+ "loss": 0.0975,
1664
+ "mean_token_accuracy": 0.9727846592664718,
1665
+ "num_tokens": 1955312.0,
1666
+ "step": 1620
1667
+ },
1668
+ {
1669
+ "entropy": 0.08373252097517252,
1670
+ "epoch": 3.7222857142857144,
1671
+ "grad_norm": 0.3565508723258972,
1672
+ "learning_rate": 2.5707762557077625e-06,
1673
+ "loss": 0.0821,
1674
+ "mean_token_accuracy": 0.9794006440788507,
1675
+ "num_tokens": 2048367.0,
1676
+ "step": 1630
1677
+ },
1678
+ {
1679
+ "entropy": 0.11184893473982811,
1680
+ "epoch": 3.745142857142857,
1681
+ "grad_norm": 0.5002964735031128,
1682
+ "learning_rate": 2.5251141552511414e-06,
1683
+ "loss": 0.1033,
1684
+ "mean_token_accuracy": 0.9729055386036635,
1685
+ "num_tokens": 2107094.0,
1686
+ "step": 1640
1687
+ },
1688
+ {
1689
+ "entropy": 0.1523228184087202,
1690
+ "epoch": 3.768,
1691
+ "grad_norm": 0.5381714701652527,
1692
+ "learning_rate": 2.479452054794521e-06,
1693
+ "loss": 0.14,
1694
+ "mean_token_accuracy": 0.961335464194417,
1695
+ "num_tokens": 2146441.0,
1696
+ "step": 1650
1697
+ },
1698
+ {
1699
+ "entropy": 0.1763300130609423,
1700
+ "epoch": 3.790857142857143,
1701
+ "grad_norm": 0.6653213500976562,
1702
+ "learning_rate": 2.4337899543378997e-06,
1703
+ "loss": 0.1613,
1704
+ "mean_token_accuracy": 0.955818934738636,
1705
+ "num_tokens": 2174324.0,
1706
+ "step": 1660
1707
+ },
1708
+ {
1709
+ "entropy": 0.11155222558882087,
1710
+ "epoch": 3.8137142857142856,
1711
+ "grad_norm": 0.34875810146331787,
1712
+ "learning_rate": 2.3881278538812787e-06,
1713
+ "loss": 0.0996,
1714
+ "mean_token_accuracy": 0.9726988013833762,
1715
+ "num_tokens": 2275421.0,
1716
+ "step": 1670
1717
+ },
1718
+ {
1719
+ "entropy": 0.0848108597798273,
1720
+ "epoch": 3.8365714285714283,
1721
+ "grad_norm": 0.30915647745132446,
1722
+ "learning_rate": 2.3424657534246576e-06,
1723
+ "loss": 0.0791,
1724
+ "mean_token_accuracy": 0.979795042052865,
1725
+ "num_tokens": 2362139.0,
1726
+ "step": 1680
1727
+ },
1728
+ {
1729
+ "entropy": 0.12119822092354297,
1730
+ "epoch": 3.8594285714285714,
1731
+ "grad_norm": 0.49934130907058716,
1732
+ "learning_rate": 2.296803652968037e-06,
1733
+ "loss": 0.1121,
1734
+ "mean_token_accuracy": 0.969853087887168,
1735
+ "num_tokens": 2415046.0,
1736
+ "step": 1690
1737
+ },
1738
+ {
1739
+ "entropy": 0.15342484817374497,
1740
+ "epoch": 3.8822857142857146,
1741
+ "grad_norm": 0.5518330931663513,
1742
+ "learning_rate": 2.251141552511416e-06,
1743
+ "loss": 0.1446,
1744
+ "mean_token_accuracy": 0.9614712443202734,
1745
+ "num_tokens": 2450506.0,
1746
+ "step": 1700
1747
+ },
1748
+ {
1749
+ "entropy": 0.17686844640411437,
1750
+ "epoch": 3.9051428571428572,
1751
+ "grad_norm": 0.7212273478507996,
1752
+ "learning_rate": 2.205479452054795e-06,
1753
+ "loss": 0.1646,
1754
+ "mean_token_accuracy": 0.956280616670847,
1755
+ "num_tokens": 2477052.0,
1756
+ "step": 1710
1757
+ },
1758
+ {
1759
+ "entropy": 0.11632896211231128,
1760
+ "epoch": 3.928,
1761
+ "grad_norm": 0.23661138117313385,
1762
+ "learning_rate": 2.159817351598174e-06,
1763
+ "loss": 0.1019,
1764
+ "mean_token_accuracy": 0.9717216279357672,
1765
+ "num_tokens": 2572680.0,
1766
+ "step": 1720
1767
+ },
1768
+ {
1769
+ "entropy": 0.09558084616437554,
1770
+ "epoch": 3.950857142857143,
1771
+ "grad_norm": 0.3915014863014221,
1772
+ "learning_rate": 2.1141552511415528e-06,
1773
+ "loss": 0.0887,
1774
+ "mean_token_accuracy": 0.9764208119362593,
1775
+ "num_tokens": 2646523.0,
1776
+ "step": 1730
1777
+ },
1778
+ {
1779
+ "entropy": 0.1366618540836498,
1780
+ "epoch": 3.9737142857142858,
1781
+ "grad_norm": 0.514821469783783,
1782
+ "learning_rate": 2.0684931506849317e-06,
1783
+ "loss": 0.1276,
1784
+ "mean_token_accuracy": 0.9655411653220654,
1785
+ "num_tokens": 2687673.0,
1786
+ "step": 1740
1787
+ },
1788
+ {
1789
+ "entropy": 0.1805182583630085,
1790
+ "epoch": 3.9965714285714284,
1791
+ "grad_norm": 0.780587375164032,
1792
+ "learning_rate": 2.0228310502283106e-06,
1793
+ "loss": 0.162,
1794
+ "mean_token_accuracy": 0.9553768526762724,
1795
+ "num_tokens": 2714171.0,
1796
+ "step": 1750
1797
+ },
1798
+ {
1799
+ "epoch": 4.0,
1800
+ "eval_accuracy": 0.0015129150056399597,
1801
+ "eval_entropy": 0.3331289222341707,
1802
+ "eval_loss": 1.1662757396697998,
1803
+ "eval_mean_token_accuracy": 0.8316388031230469,
1804
+ "eval_num_tokens": 2716693.0,
1805
+ "eval_runtime": 692.1518,
1806
+ "eval_samples_per_second": 1.494,
1807
+ "eval_steps_per_second": 0.374,
1808
+ "step": 1752
1809
  }
1810
  ],
1811
  "logging_steps": 10,
 
1825
  "attributes": {}
1826
  }
1827
  },
1828
+ "total_flos": 7.585252497207706e+17,
1829
  "train_batch_size": 1,
1830
  "trial_name": null,
1831
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc1b103633cf7c9962527dcf216e434ddad474edf117eac5e9f686412165c6b7
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39f58d4a909c837190e81596a49672875e79fb1248b5b446d03a52e65354d414
3
  size 6353