Instructions to use rovdetection/code-1b-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use rovdetection/code-1b-instruct with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("rovdetection/code-1b-instruct", dtype="auto") - Notebooks
- Google Colab
- Kaggle
Training in progress, step 2000, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9446744
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdf2871a23de26395412fbb80cd5cfc6261483030011b659b66248a001490ba5
|
| 3 |
size 9446744
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4879947
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13d0d3ac532ad9924ef2b3bb9206e041a19d9bb2aae0a0f9b0e9fb94268b3e2f
|
| 3 |
size 4879947
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96666620a506272b19319944e27b166707266143df40b9e008c7e67e99eb3d33
|
| 3 |
size 14917
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad3ae1599c24410db8dc749055bc50d225b3704ca4ce296c6043ed130093cd3d
|
| 3 |
size 14917
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4aa03f6e0cd07cf67ce1fbe3101d545f5771ef9148b9debf02b11cf6948da5c
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa56fa8fa334bce407f019356c2a989207ab5f10b19e9753e7cbc5ea11bcd4ec
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1508,6 +1508,506 @@
|
|
| 1508 |
"mean_token_accuracy": 0.6264939974993468,
|
| 1509 |
"num_tokens": 8909416.0,
|
| 1510 |
"step": 1500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1511 |
}
|
| 1512 |
],
|
| 1513 |
"logging_steps": 10,
|
|
@@ -1527,7 +2027,7 @@
|
|
| 1527 |
"attributes": {}
|
| 1528 |
}
|
| 1529 |
},
|
| 1530 |
-
"total_flos":
|
| 1531 |
"train_batch_size": 2,
|
| 1532 |
"trial_name": null,
|
| 1533 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.436707500537288,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 2000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1508 |
"mean_token_accuracy": 0.6264939974993468,
|
| 1509 |
"num_tokens": 8909416.0,
|
| 1510 |
"step": 1500
|
| 1511 |
+
},
|
| 1512 |
+
{
|
| 1513 |
+
"entropy": 1.8110749498009682,
|
| 1514 |
+
"epoch": 2.5948850204169354,
|
| 1515 |
+
"grad_norm": 0.7108538746833801,
|
| 1516 |
+
"learning_rate": 0.00013964,
|
| 1517 |
+
"loss": 1.8952640533447265,
|
| 1518 |
+
"mean_token_accuracy": 0.6537120632827282,
|
| 1519 |
+
"num_tokens": 8968510.0,
|
| 1520 |
+
"step": 1510
|
| 1521 |
+
},
|
| 1522 |
+
{
|
| 1523 |
+
"entropy": 1.977073846757412,
|
| 1524 |
+
"epoch": 2.612078229099506,
|
| 1525 |
+
"grad_norm": 0.7554802298545837,
|
| 1526 |
+
"learning_rate": 0.00013924,
|
| 1527 |
+
"loss": 2.0621898651123045,
|
| 1528 |
+
"mean_token_accuracy": 0.6327366977930069,
|
| 1529 |
+
"num_tokens": 9026884.0,
|
| 1530 |
+
"step": 1520
|
| 1531 |
+
},
|
| 1532 |
+
{
|
| 1533 |
+
"entropy": 1.8783492282032968,
|
| 1534 |
+
"epoch": 2.629271437782076,
|
| 1535 |
+
"grad_norm": 0.6592015027999878,
|
| 1536 |
+
"learning_rate": 0.00013884000000000002,
|
| 1537 |
+
"loss": 1.9230785369873047,
|
| 1538 |
+
"mean_token_accuracy": 0.6494536675512791,
|
| 1539 |
+
"num_tokens": 9085571.0,
|
| 1540 |
+
"step": 1530
|
| 1541 |
+
},
|
| 1542 |
+
{
|
| 1543 |
+
"entropy": 1.9282778173685073,
|
| 1544 |
+
"epoch": 2.6464646464646466,
|
| 1545 |
+
"grad_norm": 0.7717080116271973,
|
| 1546 |
+
"learning_rate": 0.00013844,
|
| 1547 |
+
"loss": 2.0319377899169924,
|
| 1548 |
+
"mean_token_accuracy": 0.6344667036086321,
|
| 1549 |
+
"num_tokens": 9147549.0,
|
| 1550 |
+
"step": 1540
|
| 1551 |
+
},
|
| 1552 |
+
{
|
| 1553 |
+
"entropy": 1.903467869758606,
|
| 1554 |
+
"epoch": 2.6636578551472168,
|
| 1555 |
+
"grad_norm": 0.6227516531944275,
|
| 1556 |
+
"learning_rate": 0.00013804000000000003,
|
| 1557 |
+
"loss": 1.9306724548339844,
|
| 1558 |
+
"mean_token_accuracy": 0.644033481925726,
|
| 1559 |
+
"num_tokens": 9204942.0,
|
| 1560 |
+
"step": 1550
|
| 1561 |
+
},
|
| 1562 |
+
{
|
| 1563 |
+
"entropy": 1.8967040538787843,
|
| 1564 |
+
"epoch": 2.6808510638297873,
|
| 1565 |
+
"grad_norm": 0.6684938073158264,
|
| 1566 |
+
"learning_rate": 0.00013764000000000002,
|
| 1567 |
+
"loss": 2.001560592651367,
|
| 1568 |
+
"mean_token_accuracy": 0.6470274899154902,
|
| 1569 |
+
"num_tokens": 9266446.0,
|
| 1570 |
+
"step": 1560
|
| 1571 |
+
},
|
| 1572 |
+
{
|
| 1573 |
+
"entropy": 1.8590586185455322,
|
| 1574 |
+
"epoch": 2.6980442725123575,
|
| 1575 |
+
"grad_norm": 0.6150694489479065,
|
| 1576 |
+
"learning_rate": 0.00013724,
|
| 1577 |
+
"loss": 1.9280338287353516,
|
| 1578 |
+
"mean_token_accuracy": 0.6484670951962471,
|
| 1579 |
+
"num_tokens": 9326109.0,
|
| 1580 |
+
"step": 1570
|
| 1581 |
+
},
|
| 1582 |
+
{
|
| 1583 |
+
"entropy": 1.9293041676282883,
|
| 1584 |
+
"epoch": 2.715237481194928,
|
| 1585 |
+
"grad_norm": 0.6057704091072083,
|
| 1586 |
+
"learning_rate": 0.00013684000000000002,
|
| 1587 |
+
"loss": 1.9943519592285157,
|
| 1588 |
+
"mean_token_accuracy": 0.6371258046478033,
|
| 1589 |
+
"num_tokens": 9385073.0,
|
| 1590 |
+
"step": 1580
|
| 1591 |
+
},
|
| 1592 |
+
{
|
| 1593 |
+
"entropy": 1.8843669161200522,
|
| 1594 |
+
"epoch": 2.732430689877498,
|
| 1595 |
+
"grad_norm": 0.6834639310836792,
|
| 1596 |
+
"learning_rate": 0.00013644000000000002,
|
| 1597 |
+
"loss": 1.9569879531860352,
|
| 1598 |
+
"mean_token_accuracy": 0.6437417894601822,
|
| 1599 |
+
"num_tokens": 9445137.0,
|
| 1600 |
+
"step": 1590
|
| 1601 |
+
},
|
| 1602 |
+
{
|
| 1603 |
+
"entropy": 1.8529930964112282,
|
| 1604 |
+
"epoch": 2.7496238985600687,
|
| 1605 |
+
"grad_norm": 0.6442180871963501,
|
| 1606 |
+
"learning_rate": 0.00013604,
|
| 1607 |
+
"loss": 1.8902450561523438,
|
| 1608 |
+
"mean_token_accuracy": 0.6518216013908387,
|
| 1609 |
+
"num_tokens": 9504160.0,
|
| 1610 |
+
"step": 1600
|
| 1611 |
+
},
|
| 1612 |
+
{
|
| 1613 |
+
"entropy": 1.939158782362938,
|
| 1614 |
+
"epoch": 2.7668171072426393,
|
| 1615 |
+
"grad_norm": 0.6240729689598083,
|
| 1616 |
+
"learning_rate": 0.00013564000000000002,
|
| 1617 |
+
"loss": 2.0188575744628907,
|
| 1618 |
+
"mean_token_accuracy": 0.63564417026937,
|
| 1619 |
+
"num_tokens": 9564675.0,
|
| 1620 |
+
"step": 1610
|
| 1621 |
+
},
|
| 1622 |
+
{
|
| 1623 |
+
"entropy": 1.9281259045004844,
|
| 1624 |
+
"epoch": 2.7840103159252094,
|
| 1625 |
+
"grad_norm": 0.750890851020813,
|
| 1626 |
+
"learning_rate": 0.00013524,
|
| 1627 |
+
"loss": 2.017038345336914,
|
| 1628 |
+
"mean_token_accuracy": 0.6387452960014344,
|
| 1629 |
+
"num_tokens": 9625026.0,
|
| 1630 |
+
"step": 1620
|
| 1631 |
+
},
|
| 1632 |
+
{
|
| 1633 |
+
"entropy": 1.873080413043499,
|
| 1634 |
+
"epoch": 2.80120352460778,
|
| 1635 |
+
"grad_norm": 0.776397168636322,
|
| 1636 |
+
"learning_rate": 0.00013484,
|
| 1637 |
+
"loss": 1.9759422302246095,
|
| 1638 |
+
"mean_token_accuracy": 0.6433901283890009,
|
| 1639 |
+
"num_tokens": 9685967.0,
|
| 1640 |
+
"step": 1630
|
| 1641 |
+
},
|
| 1642 |
+
{
|
| 1643 |
+
"entropy": 1.9089648619294166,
|
| 1644 |
+
"epoch": 2.81839673329035,
|
| 1645 |
+
"grad_norm": 0.6481618881225586,
|
| 1646 |
+
"learning_rate": 0.00013444000000000002,
|
| 1647 |
+
"loss": 1.956050491333008,
|
| 1648 |
+
"mean_token_accuracy": 0.6402542922645807,
|
| 1649 |
+
"num_tokens": 9745233.0,
|
| 1650 |
+
"step": 1640
|
| 1651 |
+
},
|
| 1652 |
+
{
|
| 1653 |
+
"entropy": 1.975960558652878,
|
| 1654 |
+
"epoch": 2.8355899419729207,
|
| 1655 |
+
"grad_norm": 0.6896694302558899,
|
| 1656 |
+
"learning_rate": 0.00013404,
|
| 1657 |
+
"loss": 2.0583721160888673,
|
| 1658 |
+
"mean_token_accuracy": 0.6340504981577396,
|
| 1659 |
+
"num_tokens": 9805150.0,
|
| 1660 |
+
"step": 1650
|
| 1661 |
+
},
|
| 1662 |
+
{
|
| 1663 |
+
"entropy": 1.945571132004261,
|
| 1664 |
+
"epoch": 2.8527831506554913,
|
| 1665 |
+
"grad_norm": 0.6386220455169678,
|
| 1666 |
+
"learning_rate": 0.00013364,
|
| 1667 |
+
"loss": 2.03116512298584,
|
| 1668 |
+
"mean_token_accuracy": 0.6365220382809639,
|
| 1669 |
+
"num_tokens": 9861196.0,
|
| 1670 |
+
"step": 1660
|
| 1671 |
+
},
|
| 1672 |
+
{
|
| 1673 |
+
"entropy": 1.9110410138964653,
|
| 1674 |
+
"epoch": 2.8699763593380614,
|
| 1675 |
+
"grad_norm": 0.7503199577331543,
|
| 1676 |
+
"learning_rate": 0.00013324000000000002,
|
| 1677 |
+
"loss": 1.9521196365356446,
|
| 1678 |
+
"mean_token_accuracy": 0.6381696432828903,
|
| 1679 |
+
"num_tokens": 9921155.0,
|
| 1680 |
+
"step": 1670
|
| 1681 |
+
},
|
| 1682 |
+
{
|
| 1683 |
+
"entropy": 1.849820225685835,
|
| 1684 |
+
"epoch": 2.887169568020632,
|
| 1685 |
+
"grad_norm": 0.6197855472564697,
|
| 1686 |
+
"learning_rate": 0.00013284,
|
| 1687 |
+
"loss": 1.8909440994262696,
|
| 1688 |
+
"mean_token_accuracy": 0.6426266122609376,
|
| 1689 |
+
"num_tokens": 9979351.0,
|
| 1690 |
+
"step": 1680
|
| 1691 |
+
},
|
| 1692 |
+
{
|
| 1693 |
+
"entropy": 1.8932805389165879,
|
| 1694 |
+
"epoch": 2.904362776703202,
|
| 1695 |
+
"grad_norm": 0.6703120470046997,
|
| 1696 |
+
"learning_rate": 0.00013244,
|
| 1697 |
+
"loss": 2.0233718872070314,
|
| 1698 |
+
"mean_token_accuracy": 0.646468547359109,
|
| 1699 |
+
"num_tokens": 10041238.0,
|
| 1700 |
+
"step": 1690
|
| 1701 |
+
},
|
| 1702 |
+
{
|
| 1703 |
+
"entropy": 1.8625088930130005,
|
| 1704 |
+
"epoch": 2.9215559853857727,
|
| 1705 |
+
"grad_norm": 0.73073410987854,
|
| 1706 |
+
"learning_rate": 0.00013204000000000002,
|
| 1707 |
+
"loss": 1.9317462921142579,
|
| 1708 |
+
"mean_token_accuracy": 0.6454428397119045,
|
| 1709 |
+
"num_tokens": 10099496.0,
|
| 1710 |
+
"step": 1700
|
| 1711 |
+
},
|
| 1712 |
+
{
|
| 1713 |
+
"entropy": 1.9354272544384004,
|
| 1714 |
+
"epoch": 2.9387491940683432,
|
| 1715 |
+
"grad_norm": 0.6566579937934875,
|
| 1716 |
+
"learning_rate": 0.00013164,
|
| 1717 |
+
"loss": 2.0027164459228515,
|
| 1718 |
+
"mean_token_accuracy": 0.6403926335275173,
|
| 1719 |
+
"num_tokens": 10161720.0,
|
| 1720 |
+
"step": 1710
|
| 1721 |
+
},
|
| 1722 |
+
{
|
| 1723 |
+
"entropy": 1.88578300178051,
|
| 1724 |
+
"epoch": 2.9559424027509134,
|
| 1725 |
+
"grad_norm": 0.7905747890472412,
|
| 1726 |
+
"learning_rate": 0.00013124,
|
| 1727 |
+
"loss": 1.9767372131347656,
|
| 1728 |
+
"mean_token_accuracy": 0.6421503167599439,
|
| 1729 |
+
"num_tokens": 10221734.0,
|
| 1730 |
+
"step": 1720
|
| 1731 |
+
},
|
| 1732 |
+
{
|
| 1733 |
+
"entropy": 1.870301403105259,
|
| 1734 |
+
"epoch": 2.973135611433484,
|
| 1735 |
+
"grad_norm": 0.7210419774055481,
|
| 1736 |
+
"learning_rate": 0.00013084000000000001,
|
| 1737 |
+
"loss": 1.9475433349609375,
|
| 1738 |
+
"mean_token_accuracy": 0.6528905872255564,
|
| 1739 |
+
"num_tokens": 10280223.0,
|
| 1740 |
+
"step": 1730
|
| 1741 |
+
},
|
| 1742 |
+
{
|
| 1743 |
+
"entropy": 1.8696911588311196,
|
| 1744 |
+
"epoch": 2.990328820116054,
|
| 1745 |
+
"grad_norm": 0.626354992389679,
|
| 1746 |
+
"learning_rate": 0.00013044,
|
| 1747 |
+
"loss": 1.926706314086914,
|
| 1748 |
+
"mean_token_accuracy": 0.6482070714235306,
|
| 1749 |
+
"num_tokens": 10339813.0,
|
| 1750 |
+
"step": 1740
|
| 1751 |
+
},
|
| 1752 |
+
{
|
| 1753 |
+
"entropy": 1.821205088844547,
|
| 1754 |
+
"epoch": 3.006877283473028,
|
| 1755 |
+
"grad_norm": 0.6353569030761719,
|
| 1756 |
+
"learning_rate": 0.00013004,
|
| 1757 |
+
"loss": 1.8657075881958007,
|
| 1758 |
+
"mean_token_accuracy": 0.6556356762136731,
|
| 1759 |
+
"num_tokens": 10398519.0,
|
| 1760 |
+
"step": 1750
|
| 1761 |
+
},
|
| 1762 |
+
{
|
| 1763 |
+
"entropy": 1.8890676617622375,
|
| 1764 |
+
"epoch": 3.0240704921555985,
|
| 1765 |
+
"grad_norm": 0.783729076385498,
|
| 1766 |
+
"learning_rate": 0.00012964,
|
| 1767 |
+
"loss": 1.9794137954711915,
|
| 1768 |
+
"mean_token_accuracy": 0.643126554042101,
|
| 1769 |
+
"num_tokens": 10456386.0,
|
| 1770 |
+
"step": 1760
|
| 1771 |
+
},
|
| 1772 |
+
{
|
| 1773 |
+
"entropy": 1.8766882956027984,
|
| 1774 |
+
"epoch": 3.041263700838169,
|
| 1775 |
+
"grad_norm": 0.7075045108795166,
|
| 1776 |
+
"learning_rate": 0.00012924,
|
| 1777 |
+
"loss": 1.9388771057128906,
|
| 1778 |
+
"mean_token_accuracy": 0.6462941512465477,
|
| 1779 |
+
"num_tokens": 10516721.0,
|
| 1780 |
+
"step": 1770
|
| 1781 |
+
},
|
| 1782 |
+
{
|
| 1783 |
+
"entropy": 1.7985384911298752,
|
| 1784 |
+
"epoch": 3.0584569095207392,
|
| 1785 |
+
"grad_norm": 0.7116262912750244,
|
| 1786 |
+
"learning_rate": 0.00012884,
|
| 1787 |
+
"loss": 1.8379974365234375,
|
| 1788 |
+
"mean_token_accuracy": 0.6582404263317585,
|
| 1789 |
+
"num_tokens": 10575553.0,
|
| 1790 |
+
"step": 1780
|
| 1791 |
+
},
|
| 1792 |
+
{
|
| 1793 |
+
"entropy": 1.8475583091378212,
|
| 1794 |
+
"epoch": 3.07565011820331,
|
| 1795 |
+
"grad_norm": 0.69736248254776,
|
| 1796 |
+
"learning_rate": 0.00012844,
|
| 1797 |
+
"loss": 1.9197765350341798,
|
| 1798 |
+
"mean_token_accuracy": 0.6509403776377439,
|
| 1799 |
+
"num_tokens": 10632501.0,
|
| 1800 |
+
"step": 1790
|
| 1801 |
+
},
|
| 1802 |
+
{
|
| 1803 |
+
"entropy": 1.8264927819371224,
|
| 1804 |
+
"epoch": 3.09284332688588,
|
| 1805 |
+
"grad_norm": 0.6354222297668457,
|
| 1806 |
+
"learning_rate": 0.00012804,
|
| 1807 |
+
"loss": 1.8965986251831055,
|
| 1808 |
+
"mean_token_accuracy": 0.6518782209604979,
|
| 1809 |
+
"num_tokens": 10693167.0,
|
| 1810 |
+
"step": 1800
|
| 1811 |
+
},
|
| 1812 |
+
{
|
| 1813 |
+
"entropy": 1.8696907818317414,
|
| 1814 |
+
"epoch": 3.1100365355684505,
|
| 1815 |
+
"grad_norm": 0.7568804621696472,
|
| 1816 |
+
"learning_rate": 0.00012764,
|
| 1817 |
+
"loss": 1.9332853317260743,
|
| 1818 |
+
"mean_token_accuracy": 0.6471077598631382,
|
| 1819 |
+
"num_tokens": 10753837.0,
|
| 1820 |
+
"step": 1810
|
| 1821 |
+
},
|
| 1822 |
+
{
|
| 1823 |
+
"entropy": 1.886954003572464,
|
| 1824 |
+
"epoch": 3.1272297442510206,
|
| 1825 |
+
"grad_norm": 0.7069846391677856,
|
| 1826 |
+
"learning_rate": 0.00012724,
|
| 1827 |
+
"loss": 1.9263908386230468,
|
| 1828 |
+
"mean_token_accuracy": 0.6466126769781113,
|
| 1829 |
+
"num_tokens": 10815256.0,
|
| 1830 |
+
"step": 1820
|
| 1831 |
+
},
|
| 1832 |
+
{
|
| 1833 |
+
"entropy": 1.8424360305070877,
|
| 1834 |
+
"epoch": 3.144422952933591,
|
| 1835 |
+
"grad_norm": 0.6524083614349365,
|
| 1836 |
+
"learning_rate": 0.00012684,
|
| 1837 |
+
"loss": 1.9088315963745117,
|
| 1838 |
+
"mean_token_accuracy": 0.6496367674320936,
|
| 1839 |
+
"num_tokens": 10877848.0,
|
| 1840 |
+
"step": 1830
|
| 1841 |
+
},
|
| 1842 |
+
{
|
| 1843 |
+
"entropy": 1.8966794192790986,
|
| 1844 |
+
"epoch": 3.1616161616161618,
|
| 1845 |
+
"grad_norm": 0.687421977519989,
|
| 1846 |
+
"learning_rate": 0.00012644000000000002,
|
| 1847 |
+
"loss": 1.9748069763183593,
|
| 1848 |
+
"mean_token_accuracy": 0.6424707356840372,
|
| 1849 |
+
"num_tokens": 10938042.0,
|
| 1850 |
+
"step": 1840
|
| 1851 |
+
},
|
| 1852 |
+
{
|
| 1853 |
+
"entropy": 1.81406429708004,
|
| 1854 |
+
"epoch": 3.178809370298732,
|
| 1855 |
+
"grad_norm": 0.7668496370315552,
|
| 1856 |
+
"learning_rate": 0.00012604,
|
| 1857 |
+
"loss": 1.8712465286254882,
|
| 1858 |
+
"mean_token_accuracy": 0.6571074567735196,
|
| 1859 |
+
"num_tokens": 10996204.0,
|
| 1860 |
+
"step": 1850
|
| 1861 |
+
},
|
| 1862 |
+
{
|
| 1863 |
+
"entropy": 1.8159340515732765,
|
| 1864 |
+
"epoch": 3.1960025789813025,
|
| 1865 |
+
"grad_norm": 0.7182545065879822,
|
| 1866 |
+
"learning_rate": 0.00012564,
|
| 1867 |
+
"loss": 1.830276107788086,
|
| 1868 |
+
"mean_token_accuracy": 0.6546356856822968,
|
| 1869 |
+
"num_tokens": 11056605.0,
|
| 1870 |
+
"step": 1860
|
| 1871 |
+
},
|
| 1872 |
+
{
|
| 1873 |
+
"entropy": 1.9095668271183968,
|
| 1874 |
+
"epoch": 3.2131957876638726,
|
| 1875 |
+
"grad_norm": 0.7548812031745911,
|
| 1876 |
+
"learning_rate": 0.00012524000000000001,
|
| 1877 |
+
"loss": 1.998922348022461,
|
| 1878 |
+
"mean_token_accuracy": 0.6411306612193585,
|
| 1879 |
+
"num_tokens": 11116614.0,
|
| 1880 |
+
"step": 1870
|
| 1881 |
+
},
|
| 1882 |
+
{
|
| 1883 |
+
"entropy": 1.8717206478118897,
|
| 1884 |
+
"epoch": 3.230388996346443,
|
| 1885 |
+
"grad_norm": 0.7692223191261292,
|
| 1886 |
+
"learning_rate": 0.00012484,
|
| 1887 |
+
"loss": 1.914438247680664,
|
| 1888 |
+
"mean_token_accuracy": 0.6441164951771498,
|
| 1889 |
+
"num_tokens": 11175802.0,
|
| 1890 |
+
"step": 1880
|
| 1891 |
+
},
|
| 1892 |
+
{
|
| 1893 |
+
"entropy": 1.8943733513355254,
|
| 1894 |
+
"epoch": 3.2475822050290137,
|
| 1895 |
+
"grad_norm": 0.6439138650894165,
|
| 1896 |
+
"learning_rate": 0.00012444,
|
| 1897 |
+
"loss": 1.9280553817749024,
|
| 1898 |
+
"mean_token_accuracy": 0.6476396139711141,
|
| 1899 |
+
"num_tokens": 11236477.0,
|
| 1900 |
+
"step": 1890
|
| 1901 |
+
},
|
| 1902 |
+
{
|
| 1903 |
+
"entropy": 1.8841392308473588,
|
| 1904 |
+
"epoch": 3.264775413711584,
|
| 1905 |
+
"grad_norm": 0.6971343159675598,
|
| 1906 |
+
"learning_rate": 0.00012404,
|
| 1907 |
+
"loss": 1.942568588256836,
|
| 1908 |
+
"mean_token_accuracy": 0.6398356795310974,
|
| 1909 |
+
"num_tokens": 11295146.0,
|
| 1910 |
+
"step": 1900
|
| 1911 |
+
},
|
| 1912 |
+
{
|
| 1913 |
+
"entropy": 1.8830088019371032,
|
| 1914 |
+
"epoch": 3.2819686223941544,
|
| 1915 |
+
"grad_norm": 0.7196023464202881,
|
| 1916 |
+
"learning_rate": 0.00012364,
|
| 1917 |
+
"loss": 1.963007354736328,
|
| 1918 |
+
"mean_token_accuracy": 0.6452915534377098,
|
| 1919 |
+
"num_tokens": 11355726.0,
|
| 1920 |
+
"step": 1910
|
| 1921 |
+
},
|
| 1922 |
+
{
|
| 1923 |
+
"entropy": 1.927216087281704,
|
| 1924 |
+
"epoch": 3.2991618310767246,
|
| 1925 |
+
"grad_norm": 0.790634274482727,
|
| 1926 |
+
"learning_rate": 0.00012324,
|
| 1927 |
+
"loss": 2.0809165954589846,
|
| 1928 |
+
"mean_token_accuracy": 0.6384686015546321,
|
| 1929 |
+
"num_tokens": 11415237.0,
|
| 1930 |
+
"step": 1920
|
| 1931 |
+
},
|
| 1932 |
+
{
|
| 1933 |
+
"entropy": 1.849087017774582,
|
| 1934 |
+
"epoch": 3.316355039759295,
|
| 1935 |
+
"grad_norm": 0.6752087473869324,
|
| 1936 |
+
"learning_rate": 0.00012284,
|
| 1937 |
+
"loss": 1.9017595291137694,
|
| 1938 |
+
"mean_token_accuracy": 0.6522149413824081,
|
| 1939 |
+
"num_tokens": 11476337.0,
|
| 1940 |
+
"step": 1930
|
| 1941 |
+
},
|
| 1942 |
+
{
|
| 1943 |
+
"entropy": 1.8517325416207313,
|
| 1944 |
+
"epoch": 3.3335482484418657,
|
| 1945 |
+
"grad_norm": 0.8036973476409912,
|
| 1946 |
+
"learning_rate": 0.00012244,
|
| 1947 |
+
"loss": 1.9011222839355468,
|
| 1948 |
+
"mean_token_accuracy": 0.6499856971204281,
|
| 1949 |
+
"num_tokens": 11537529.0,
|
| 1950 |
+
"step": 1940
|
| 1951 |
+
},
|
| 1952 |
+
{
|
| 1953 |
+
"entropy": 1.7622334837913514,
|
| 1954 |
+
"epoch": 3.350741457124436,
|
| 1955 |
+
"grad_norm": 0.7138587832450867,
|
| 1956 |
+
"learning_rate": 0.00012204,
|
| 1957 |
+
"loss": 1.7955827713012695,
|
| 1958 |
+
"mean_token_accuracy": 0.6596556272357702,
|
| 1959 |
+
"num_tokens": 11595421.0,
|
| 1960 |
+
"step": 1950
|
| 1961 |
+
},
|
| 1962 |
+
{
|
| 1963 |
+
"entropy": 1.8950866341590882,
|
| 1964 |
+
"epoch": 3.3679346658070064,
|
| 1965 |
+
"grad_norm": 0.6869714260101318,
|
| 1966 |
+
"learning_rate": 0.00012164,
|
| 1967 |
+
"loss": 1.948552131652832,
|
| 1968 |
+
"mean_token_accuracy": 0.6493024453520775,
|
| 1969 |
+
"num_tokens": 11655749.0,
|
| 1970 |
+
"step": 1960
|
| 1971 |
+
},
|
| 1972 |
+
{
|
| 1973 |
+
"entropy": 1.9235218942165375,
|
| 1974 |
+
"epoch": 3.3851278744895765,
|
| 1975 |
+
"grad_norm": 0.656403124332428,
|
| 1976 |
+
"learning_rate": 0.00012124,
|
| 1977 |
+
"loss": 2.04327449798584,
|
| 1978 |
+
"mean_token_accuracy": 0.6389912366867065,
|
| 1979 |
+
"num_tokens": 11717271.0,
|
| 1980 |
+
"step": 1970
|
| 1981 |
+
},
|
| 1982 |
+
{
|
| 1983 |
+
"entropy": 1.834906594455242,
|
| 1984 |
+
"epoch": 3.402321083172147,
|
| 1985 |
+
"grad_norm": 0.7343699932098389,
|
| 1986 |
+
"learning_rate": 0.00012084,
|
| 1987 |
+
"loss": 1.9038848876953125,
|
| 1988 |
+
"mean_token_accuracy": 0.6569048661738635,
|
| 1989 |
+
"num_tokens": 11778095.0,
|
| 1990 |
+
"step": 1980
|
| 1991 |
+
},
|
| 1992 |
+
{
|
| 1993 |
+
"entropy": 1.8515655741095542,
|
| 1994 |
+
"epoch": 3.4195142918547172,
|
| 1995 |
+
"grad_norm": 0.7009745240211487,
|
| 1996 |
+
"learning_rate": 0.00012043999999999999,
|
| 1997 |
+
"loss": 1.9157728195190429,
|
| 1998 |
+
"mean_token_accuracy": 0.6512683361768723,
|
| 1999 |
+
"num_tokens": 11835954.0,
|
| 2000 |
+
"step": 1990
|
| 2001 |
+
},
|
| 2002 |
+
{
|
| 2003 |
+
"entropy": 1.8634012743830681,
|
| 2004 |
+
"epoch": 3.436707500537288,
|
| 2005 |
+
"grad_norm": 0.6880552172660828,
|
| 2006 |
+
"learning_rate": 0.00012004,
|
| 2007 |
+
"loss": 1.9772762298583983,
|
| 2008 |
+
"mean_token_accuracy": 0.6531724959611893,
|
| 2009 |
+
"num_tokens": 11896615.0,
|
| 2010 |
+
"step": 2000
|
| 2011 |
}
|
| 2012 |
],
|
| 2013 |
"logging_steps": 10,
|
|
|
|
| 2027 |
"attributes": {}
|
| 2028 |
}
|
| 2029 |
},
|
| 2030 |
+
"total_flos": 9.772738986953933e+16,
|
| 2031 |
"train_batch_size": 2,
|
| 2032 |
"trial_name": null,
|
| 2033 |
"trial_params": null
|