Training in progress, step 208, checkpoint
Browse files
last-checkpoint/model-00001-of-00002.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4969539560
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50a7626c3332382c720b25d7028428e6e693206a85b1d278123f350e6447c549
|
| 3 |
size 4969539560
|
last-checkpoint/model-00002-of-00002.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1912795688
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:674ac2c674747082527a37e7013363c3374ff004d5b78edf91c3585792370cd4
|
| 3 |
size 1912795688
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 16,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1328,6 +1328,246 @@
|
|
| 1328 |
"eval_samples_per_second": 16.828,
|
| 1329 |
"eval_steps_per_second": 16.828,
|
| 1330 |
"step": 176
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1331 |
}
|
| 1332 |
],
|
| 1333 |
"logging_steps": 1,
|
|
@@ -1347,7 +1587,7 @@
|
|
| 1347 |
"attributes": {}
|
| 1348 |
}
|
| 1349 |
},
|
| 1350 |
-
"total_flos":
|
| 1351 |
"train_batch_size": 1,
|
| 1352 |
"trial_name": null,
|
| 1353 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 52.0,
|
| 6 |
"eval_steps": 16,
|
| 7 |
+
"global_step": 208,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1328 |
"eval_samples_per_second": 16.828,
|
| 1329 |
"eval_steps_per_second": 16.828,
|
| 1330 |
"step": 176
|
| 1331 |
+
},
|
| 1332 |
+
{
|
| 1333 |
+
"epoch": 44.29090909090909,
|
| 1334 |
+
"grad_norm": 6.314858436584473,
|
| 1335 |
+
"learning_rate": 7.185729670371604e-08,
|
| 1336 |
+
"loss": 0.7001,
|
| 1337 |
+
"step": 177
|
| 1338 |
+
},
|
| 1339 |
+
{
|
| 1340 |
+
"epoch": 44.58181818181818,
|
| 1341 |
+
"grad_norm": 6.364148139953613,
|
| 1342 |
+
"learning_rate": 7.149255520259337e-08,
|
| 1343 |
+
"loss": 0.786,
|
| 1344 |
+
"step": 178
|
| 1345 |
+
},
|
| 1346 |
+
{
|
| 1347 |
+
"epoch": 44.872727272727275,
|
| 1348 |
+
"grad_norm": 5.679451942443848,
|
| 1349 |
+
"learning_rate": 7.11265577295385e-08,
|
| 1350 |
+
"loss": 0.6767,
|
| 1351 |
+
"step": 179
|
| 1352 |
+
},
|
| 1353 |
+
{
|
| 1354 |
+
"epoch": 45.0,
|
| 1355 |
+
"grad_norm": 6.4454216957092285,
|
| 1356 |
+
"learning_rate": 7.075933215667603e-08,
|
| 1357 |
+
"loss": 0.7351,
|
| 1358 |
+
"step": 180
|
| 1359 |
+
},
|
| 1360 |
+
{
|
| 1361 |
+
"epoch": 45.29090909090909,
|
| 1362 |
+
"grad_norm": 5.991427421569824,
|
| 1363 |
+
"learning_rate": 7.039090644965509e-08,
|
| 1364 |
+
"loss": 0.7047,
|
| 1365 |
+
"step": 181
|
| 1366 |
+
},
|
| 1367 |
+
{
|
| 1368 |
+
"epoch": 45.58181818181818,
|
| 1369 |
+
"grad_norm": 5.386115550994873,
|
| 1370 |
+
"learning_rate": 7.002130866551968e-08,
|
| 1371 |
+
"loss": 0.7113,
|
| 1372 |
+
"step": 182
|
| 1373 |
+
},
|
| 1374 |
+
{
|
| 1375 |
+
"epoch": 45.872727272727275,
|
| 1376 |
+
"grad_norm": 6.815364360809326,
|
| 1377 |
+
"learning_rate": 6.965056695057204e-08,
|
| 1378 |
+
"loss": 0.7255,
|
| 1379 |
+
"step": 183
|
| 1380 |
+
},
|
| 1381 |
+
{
|
| 1382 |
+
"epoch": 46.0,
|
| 1383 |
+
"grad_norm": 6.38714599609375,
|
| 1384 |
+
"learning_rate": 6.927870953822915e-08,
|
| 1385 |
+
"loss": 0.7503,
|
| 1386 |
+
"step": 184
|
| 1387 |
+
},
|
| 1388 |
+
{
|
| 1389 |
+
"epoch": 46.29090909090909,
|
| 1390 |
+
"grad_norm": 5.759856224060059,
|
| 1391 |
+
"learning_rate": 6.890576474687262e-08,
|
| 1392 |
+
"loss": 0.7008,
|
| 1393 |
+
"step": 185
|
| 1394 |
+
},
|
| 1395 |
+
{
|
| 1396 |
+
"epoch": 46.58181818181818,
|
| 1397 |
+
"grad_norm": 5.1396918296813965,
|
| 1398 |
+
"learning_rate": 6.853176097769228e-08,
|
| 1399 |
+
"loss": 0.6925,
|
| 1400 |
+
"step": 186
|
| 1401 |
+
},
|
| 1402 |
+
{
|
| 1403 |
+
"epoch": 46.872727272727275,
|
| 1404 |
+
"grad_norm": 5.9070539474487305,
|
| 1405 |
+
"learning_rate": 6.815672671252315e-08,
|
| 1406 |
+
"loss": 0.7409,
|
| 1407 |
+
"step": 187
|
| 1408 |
+
},
|
| 1409 |
+
{
|
| 1410 |
+
"epoch": 47.0,
|
| 1411 |
+
"grad_norm": 5.90541410446167,
|
| 1412 |
+
"learning_rate": 6.778069051167653e-08,
|
| 1413 |
+
"loss": 0.702,
|
| 1414 |
+
"step": 188
|
| 1415 |
+
},
|
| 1416 |
+
{
|
| 1417 |
+
"epoch": 47.29090909090909,
|
| 1418 |
+
"grad_norm": 5.474076747894287,
|
| 1419 |
+
"learning_rate": 6.740368101176495e-08,
|
| 1420 |
+
"loss": 0.7085,
|
| 1421 |
+
"step": 189
|
| 1422 |
+
},
|
| 1423 |
+
{
|
| 1424 |
+
"epoch": 47.58181818181818,
|
| 1425 |
+
"grad_norm": 5.111520767211914,
|
| 1426 |
+
"learning_rate": 6.702572692352155e-08,
|
| 1427 |
+
"loss": 0.685,
|
| 1428 |
+
"step": 190
|
| 1429 |
+
},
|
| 1430 |
+
{
|
| 1431 |
+
"epoch": 47.872727272727275,
|
| 1432 |
+
"grad_norm": 5.618140697479248,
|
| 1433 |
+
"learning_rate": 6.664685702961344e-08,
|
| 1434 |
+
"loss": 0.7551,
|
| 1435 |
+
"step": 191
|
| 1436 |
+
},
|
| 1437 |
+
{
|
| 1438 |
+
"epoch": 48.0,
|
| 1439 |
+
"grad_norm": 4.961245059967041,
|
| 1440 |
+
"learning_rate": 6.626710018244986e-08,
|
| 1441 |
+
"loss": 0.6327,
|
| 1442 |
+
"step": 192
|
| 1443 |
+
},
|
| 1444 |
+
{
|
| 1445 |
+
"epoch": 48.0,
|
| 1446 |
+
"eval_loss": 0.6752312183380127,
|
| 1447 |
+
"eval_runtime": 0.7832,
|
| 1448 |
+
"eval_samples_per_second": 16.599,
|
| 1449 |
+
"eval_steps_per_second": 16.599,
|
| 1450 |
+
"step": 192
|
| 1451 |
+
},
|
| 1452 |
+
{
|
| 1453 |
+
"epoch": 48.29090909090909,
|
| 1454 |
+
"grad_norm": 5.36975622177124,
|
| 1455 |
+
"learning_rate": 6.588648530198504e-08,
|
| 1456 |
+
"loss": 0.7312,
|
| 1457 |
+
"step": 193
|
| 1458 |
+
},
|
| 1459 |
+
{
|
| 1460 |
+
"epoch": 48.58181818181818,
|
| 1461 |
+
"grad_norm": 5.021007061004639,
|
| 1462 |
+
"learning_rate": 6.550504137351574e-08,
|
| 1463 |
+
"loss": 0.7467,
|
| 1464 |
+
"step": 194
|
| 1465 |
+
},
|
| 1466 |
+
{
|
| 1467 |
+
"epoch": 48.872727272727275,
|
| 1468 |
+
"grad_norm": 4.721583843231201,
|
| 1469 |
+
"learning_rate": 6.512279744547392e-08,
|
| 1470 |
+
"loss": 0.6271,
|
| 1471 |
+
"step": 195
|
| 1472 |
+
},
|
| 1473 |
+
{
|
| 1474 |
+
"epoch": 49.0,
|
| 1475 |
+
"grad_norm": 5.531439304351807,
|
| 1476 |
+
"learning_rate": 6.473978262721462e-08,
|
| 1477 |
+
"loss": 0.7127,
|
| 1478 |
+
"step": 196
|
| 1479 |
+
},
|
| 1480 |
+
{
|
| 1481 |
+
"epoch": 49.29090909090909,
|
| 1482 |
+
"grad_norm": 5.3525309562683105,
|
| 1483 |
+
"learning_rate": 6.435602608679917e-08,
|
| 1484 |
+
"loss": 0.7255,
|
| 1485 |
+
"step": 197
|
| 1486 |
+
},
|
| 1487 |
+
{
|
| 1488 |
+
"epoch": 49.58181818181818,
|
| 1489 |
+
"grad_norm": 4.411137104034424,
|
| 1490 |
+
"learning_rate": 6.397155704877387e-08,
|
| 1491 |
+
"loss": 0.6177,
|
| 1492 |
+
"step": 198
|
| 1493 |
+
},
|
| 1494 |
+
{
|
| 1495 |
+
"epoch": 49.872727272727275,
|
| 1496 |
+
"grad_norm": 4.907252788543701,
|
| 1497 |
+
"learning_rate": 6.358640479194451e-08,
|
| 1498 |
+
"loss": 0.7295,
|
| 1499 |
+
"step": 199
|
| 1500 |
+
},
|
| 1501 |
+
{
|
| 1502 |
+
"epoch": 50.0,
|
| 1503 |
+
"grad_norm": 4.626101493835449,
|
| 1504 |
+
"learning_rate": 6.320059864714664e-08,
|
| 1505 |
+
"loss": 0.7091,
|
| 1506 |
+
"step": 200
|
| 1507 |
+
},
|
| 1508 |
+
{
|
| 1509 |
+
"epoch": 50.29090909090909,
|
| 1510 |
+
"grad_norm": 4.853626728057861,
|
| 1511 |
+
"learning_rate": 6.281416799501187e-08,
|
| 1512 |
+
"loss": 0.7432,
|
| 1513 |
+
"step": 201
|
| 1514 |
+
},
|
| 1515 |
+
{
|
| 1516 |
+
"epoch": 50.58181818181818,
|
| 1517 |
+
"grad_norm": 4.439899921417236,
|
| 1518 |
+
"learning_rate": 6.242714226373049e-08,
|
| 1519 |
+
"loss": 0.676,
|
| 1520 |
+
"step": 202
|
| 1521 |
+
},
|
| 1522 |
+
{
|
| 1523 |
+
"epoch": 50.872727272727275,
|
| 1524 |
+
"grad_norm": 4.5280985832214355,
|
| 1525 |
+
"learning_rate": 6.203955092681039e-08,
|
| 1526 |
+
"loss": 0.7086,
|
| 1527 |
+
"step": 203
|
| 1528 |
+
},
|
| 1529 |
+
{
|
| 1530 |
+
"epoch": 51.0,
|
| 1531 |
+
"grad_norm": 4.414018154144287,
|
| 1532 |
+
"learning_rate": 6.165142350083249e-08,
|
| 1533 |
+
"loss": 0.5264,
|
| 1534 |
+
"step": 204
|
| 1535 |
+
},
|
| 1536 |
+
{
|
| 1537 |
+
"epoch": 51.29090909090909,
|
| 1538 |
+
"grad_norm": 4.17572021484375,
|
| 1539 |
+
"learning_rate": 6.126278954320294e-08,
|
| 1540 |
+
"loss": 0.7346,
|
| 1541 |
+
"step": 205
|
| 1542 |
+
},
|
| 1543 |
+
{
|
| 1544 |
+
"epoch": 51.58181818181818,
|
| 1545 |
+
"grad_norm": 4.015255928039551,
|
| 1546 |
+
"learning_rate": 6.087367864990232e-08,
|
| 1547 |
+
"loss": 0.6239,
|
| 1548 |
+
"step": 206
|
| 1549 |
+
},
|
| 1550 |
+
{
|
| 1551 |
+
"epoch": 51.872727272727275,
|
| 1552 |
+
"grad_norm": 4.698182582855225,
|
| 1553 |
+
"learning_rate": 6.048412045323163e-08,
|
| 1554 |
+
"loss": 0.688,
|
| 1555 |
+
"step": 207
|
| 1556 |
+
},
|
| 1557 |
+
{
|
| 1558 |
+
"epoch": 52.0,
|
| 1559 |
+
"grad_norm": 5.5075297355651855,
|
| 1560 |
+
"learning_rate": 6.00941446195558e-08,
|
| 1561 |
+
"loss": 0.6903,
|
| 1562 |
+
"step": 208
|
| 1563 |
+
},
|
| 1564 |
+
{
|
| 1565 |
+
"epoch": 52.0,
|
| 1566 |
+
"eval_loss": 0.6604220271110535,
|
| 1567 |
+
"eval_runtime": 0.6915,
|
| 1568 |
+
"eval_samples_per_second": 18.8,
|
| 1569 |
+
"eval_steps_per_second": 18.8,
|
| 1570 |
+
"step": 208
|
| 1571 |
}
|
| 1572 |
],
|
| 1573 |
"logging_steps": 1,
|
|
|
|
| 1587 |
"attributes": {}
|
| 1588 |
}
|
| 1589 |
},
|
| 1590 |
+
"total_flos": 6.975970872459264e+16,
|
| 1591 |
"train_batch_size": 1,
|
| 1592 |
"trial_name": null,
|
| 1593 |
"trial_params": null
|