Training in progress, epoch 9, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2695611744
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dfd0dbfd5a2e202c41c4474c701ce317f0b353244fedc30fcf9035ad3f0cfb4b
|
| 3 |
size 2695611744
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 26261260
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e71f4096ee32c8679f4882ef8cbd3add01377b6277f82c373625a3635016054e
|
| 3 |
size 26261260
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15006
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ada9c6a6e42dd9a052cc6c90ea555e554cf2266902dc66003e3529ef94dd9d78
|
| 3 |
size 15006
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17d282d44e6a12e9ca82ec6b42e7f212659678f5fe8c9a58e24a30599040d3e5
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch":
|
| 5 |
"eval_steps": 10,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -1449,6 +1449,188 @@
|
|
| 1449 |
"eval_samples_per_second": 22.032,
|
| 1450 |
"eval_steps_per_second": 5.508,
|
| 1451 |
"step": 1030
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1452 |
}
|
| 1453 |
],
|
| 1454 |
"logging_steps": 10,
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 9.0,
|
| 5 |
"eval_steps": 10,
|
| 6 |
+
"global_step": 1161,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 1449 |
"eval_samples_per_second": 22.032,
|
| 1450 |
"eval_steps_per_second": 5.508,
|
| 1451 |
"step": 1030
|
| 1452 |
+
},
|
| 1453 |
+
{
|
| 1454 |
+
"epoch": 8.062469497315764,
|
| 1455 |
+
"grad_norm": 306364.5,
|
| 1456 |
+
"learning_rate": 1.8750000000000002e-05,
|
| 1457 |
+
"loss": 458.2917,
|
| 1458 |
+
"step": 1040
|
| 1459 |
+
},
|
| 1460 |
+
{
|
| 1461 |
+
"epoch": 8.062469497315764,
|
| 1462 |
+
"eval_runtime": 19.626,
|
| 1463 |
+
"eval_samples_per_second": 22.012,
|
| 1464 |
+
"eval_steps_per_second": 5.503,
|
| 1465 |
+
"step": 1040
|
| 1466 |
+
},
|
| 1467 |
+
{
|
| 1468 |
+
"epoch": 8.140556368960468,
|
| 1469 |
+
"grad_norm": 287573.96875,
|
| 1470 |
+
"learning_rate": 1.796875e-05,
|
| 1471 |
+
"loss": 459.69,
|
| 1472 |
+
"step": 1050
|
| 1473 |
+
},
|
| 1474 |
+
{
|
| 1475 |
+
"epoch": 8.140556368960468,
|
| 1476 |
+
"eval_runtime": 19.64,
|
| 1477 |
+
"eval_samples_per_second": 21.996,
|
| 1478 |
+
"eval_steps_per_second": 5.499,
|
| 1479 |
+
"step": 1050
|
| 1480 |
+
},
|
| 1481 |
+
{
|
| 1482 |
+
"epoch": 8.218643240605173,
|
| 1483 |
+
"grad_norm": 87142.5625,
|
| 1484 |
+
"learning_rate": 1.71875e-05,
|
| 1485 |
+
"loss": 380.1467,
|
| 1486 |
+
"step": 1060
|
| 1487 |
+
},
|
| 1488 |
+
{
|
| 1489 |
+
"epoch": 8.218643240605173,
|
| 1490 |
+
"eval_runtime": 19.6334,
|
| 1491 |
+
"eval_samples_per_second": 22.003,
|
| 1492 |
+
"eval_steps_per_second": 5.501,
|
| 1493 |
+
"step": 1060
|
| 1494 |
+
},
|
| 1495 |
+
{
|
| 1496 |
+
"epoch": 8.296730112249879,
|
| 1497 |
+
"grad_norm": 301649.96875,
|
| 1498 |
+
"learning_rate": 1.6406250000000002e-05,
|
| 1499 |
+
"loss": 384.5057,
|
| 1500 |
+
"step": 1070
|
| 1501 |
+
},
|
| 1502 |
+
{
|
| 1503 |
+
"epoch": 8.296730112249879,
|
| 1504 |
+
"eval_runtime": 19.6212,
|
| 1505 |
+
"eval_samples_per_second": 22.017,
|
| 1506 |
+
"eval_steps_per_second": 5.504,
|
| 1507 |
+
"step": 1070
|
| 1508 |
+
},
|
| 1509 |
+
{
|
| 1510 |
+
"epoch": 8.374816983894583,
|
| 1511 |
+
"grad_norm": 236951.1875,
|
| 1512 |
+
"learning_rate": 1.5625e-05,
|
| 1513 |
+
"loss": 374.7868,
|
| 1514 |
+
"step": 1080
|
| 1515 |
+
},
|
| 1516 |
+
{
|
| 1517 |
+
"epoch": 8.374816983894583,
|
| 1518 |
+
"eval_runtime": 19.6187,
|
| 1519 |
+
"eval_samples_per_second": 22.02,
|
| 1520 |
+
"eval_steps_per_second": 5.505,
|
| 1521 |
+
"step": 1080
|
| 1522 |
+
},
|
| 1523 |
+
{
|
| 1524 |
+
"epoch": 8.452903855539287,
|
| 1525 |
+
"grad_norm": 76360.2734375,
|
| 1526 |
+
"learning_rate": 1.484375e-05,
|
| 1527 |
+
"loss": 312.2099,
|
| 1528 |
+
"step": 1090
|
| 1529 |
+
},
|
| 1530 |
+
{
|
| 1531 |
+
"epoch": 8.452903855539287,
|
| 1532 |
+
"eval_runtime": 19.6209,
|
| 1533 |
+
"eval_samples_per_second": 22.017,
|
| 1534 |
+
"eval_steps_per_second": 5.504,
|
| 1535 |
+
"step": 1090
|
| 1536 |
+
},
|
| 1537 |
+
{
|
| 1538 |
+
"epoch": 8.530990727183992,
|
| 1539 |
+
"grad_norm": 76876.0859375,
|
| 1540 |
+
"learning_rate": 1.4062500000000001e-05,
|
| 1541 |
+
"loss": 303.1329,
|
| 1542 |
+
"step": 1100
|
| 1543 |
+
},
|
| 1544 |
+
{
|
| 1545 |
+
"epoch": 8.530990727183992,
|
| 1546 |
+
"eval_runtime": 19.6126,
|
| 1547 |
+
"eval_samples_per_second": 22.027,
|
| 1548 |
+
"eval_steps_per_second": 5.507,
|
| 1549 |
+
"step": 1100
|
| 1550 |
+
},
|
| 1551 |
+
{
|
| 1552 |
+
"epoch": 8.609077598828698,
|
| 1553 |
+
"grad_norm": 80524.609375,
|
| 1554 |
+
"learning_rate": 1.3281250000000001e-05,
|
| 1555 |
+
"loss": 336.2521,
|
| 1556 |
+
"step": 1110
|
| 1557 |
+
},
|
| 1558 |
+
{
|
| 1559 |
+
"epoch": 8.609077598828698,
|
| 1560 |
+
"eval_runtime": 19.5907,
|
| 1561 |
+
"eval_samples_per_second": 22.051,
|
| 1562 |
+
"eval_steps_per_second": 5.513,
|
| 1563 |
+
"step": 1110
|
| 1564 |
+
},
|
| 1565 |
+
{
|
| 1566 |
+
"epoch": 8.687164470473402,
|
| 1567 |
+
"grad_norm": 114438.3828125,
|
| 1568 |
+
"learning_rate": 1.25e-05,
|
| 1569 |
+
"loss": 342.3281,
|
| 1570 |
+
"step": 1120
|
| 1571 |
+
},
|
| 1572 |
+
{
|
| 1573 |
+
"epoch": 8.687164470473402,
|
| 1574 |
+
"eval_runtime": 19.614,
|
| 1575 |
+
"eval_samples_per_second": 22.025,
|
| 1576 |
+
"eval_steps_per_second": 5.506,
|
| 1577 |
+
"step": 1120
|
| 1578 |
+
},
|
| 1579 |
+
{
|
| 1580 |
+
"epoch": 8.765251342118106,
|
| 1581 |
+
"grad_norm": 281197.375,
|
| 1582 |
+
"learning_rate": 1.171875e-05,
|
| 1583 |
+
"loss": 358.492,
|
| 1584 |
+
"step": 1130
|
| 1585 |
+
},
|
| 1586 |
+
{
|
| 1587 |
+
"epoch": 8.765251342118106,
|
| 1588 |
+
"eval_runtime": 19.5986,
|
| 1589 |
+
"eval_samples_per_second": 22.042,
|
| 1590 |
+
"eval_steps_per_second": 5.511,
|
| 1591 |
+
"step": 1130
|
| 1592 |
+
},
|
| 1593 |
+
{
|
| 1594 |
+
"epoch": 8.843338213762811,
|
| 1595 |
+
"grad_norm": 39132.8515625,
|
| 1596 |
+
"learning_rate": 1.09375e-05,
|
| 1597 |
+
"loss": 297.0417,
|
| 1598 |
+
"step": 1140
|
| 1599 |
+
},
|
| 1600 |
+
{
|
| 1601 |
+
"epoch": 8.843338213762811,
|
| 1602 |
+
"eval_runtime": 19.6013,
|
| 1603 |
+
"eval_samples_per_second": 22.039,
|
| 1604 |
+
"eval_steps_per_second": 5.51,
|
| 1605 |
+
"step": 1140
|
| 1606 |
+
},
|
| 1607 |
+
{
|
| 1608 |
+
"epoch": 8.921425085407517,
|
| 1609 |
+
"grad_norm": 270795.65625,
|
| 1610 |
+
"learning_rate": 1.0156250000000001e-05,
|
| 1611 |
+
"loss": 306.2402,
|
| 1612 |
+
"step": 1150
|
| 1613 |
+
},
|
| 1614 |
+
{
|
| 1615 |
+
"epoch": 8.921425085407517,
|
| 1616 |
+
"eval_runtime": 19.6264,
|
| 1617 |
+
"eval_samples_per_second": 22.011,
|
| 1618 |
+
"eval_steps_per_second": 5.503,
|
| 1619 |
+
"step": 1150
|
| 1620 |
+
},
|
| 1621 |
+
{
|
| 1622 |
+
"epoch": 8.99951195705222,
|
| 1623 |
+
"grad_norm": 124614.0390625,
|
| 1624 |
+
"learning_rate": 9.375000000000001e-06,
|
| 1625 |
+
"loss": 354.61,
|
| 1626 |
+
"step": 1160
|
| 1627 |
+
},
|
| 1628 |
+
{
|
| 1629 |
+
"epoch": 8.99951195705222,
|
| 1630 |
+
"eval_runtime": 19.6258,
|
| 1631 |
+
"eval_samples_per_second": 22.012,
|
| 1632 |
+
"eval_steps_per_second": 5.503,
|
| 1633 |
+
"step": 1160
|
| 1634 |
}
|
| 1635 |
],
|
| 1636 |
"logging_steps": 10,
|