Training in progress, step 3000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f81f1606cbb4066658322a9b01b024ebe1fe01d7f9c79d6a2b4af556fe6aa975
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2bdc54e623a858f4b04c457346b0f903dc827e2ac006197959be017f0bd1f45
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11ff07d587c5a9307740887f980afedff8f43c8da2bd4cbf45f5f3cf546cf38d
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a3d374142fb5a9a375b1a828a38137498daacdc810ac93109a9de1e8639e3a1
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -364,11 +364,189 @@
|
|
| 364 |
"eval_steps_per_second": 19.877,
|
| 365 |
"num_input_tokens_seen": 2097152000,
|
| 366 |
"step": 2000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
}
|
| 368 |
],
|
| 369 |
"logging_steps": 50,
|
| 370 |
"max_steps": 200000,
|
| 371 |
-
"num_input_tokens_seen":
|
| 372 |
"num_train_epochs": 5,
|
| 373 |
"save_steps": 1000,
|
| 374 |
"stateful_callbacks": {
|
|
@@ -383,7 +561,7 @@
|
|
| 383 |
"attributes": {}
|
| 384 |
}
|
| 385 |
},
|
| 386 |
-
"total_flos": 1.
|
| 387 |
"train_batch_size": 64,
|
| 388 |
"trial_name": null,
|
| 389 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.06589812972870564,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 3000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 364 |
"eval_steps_per_second": 19.877,
|
| 365 |
"num_input_tokens_seen": 2097152000,
|
| 366 |
"step": 2000
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"epoch": 0.04503038864794885,
|
| 370 |
+
"grad_norm": 0.33002936840057373,
|
| 371 |
+
"learning_rate": 0.00041,
|
| 372 |
+
"loss": 3.5684,
|
| 373 |
+
"num_input_tokens_seen": 2149580800,
|
| 374 |
+
"step": 2050
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 0.04612869081009394,
|
| 378 |
+
"grad_norm": 0.43806758522987366,
|
| 379 |
+
"learning_rate": 0.00042,
|
| 380 |
+
"loss": 3.5436,
|
| 381 |
+
"num_input_tokens_seen": 2202009600,
|
| 382 |
+
"step": 2100
|
| 383 |
+
},
|
| 384 |
+
{
|
| 385 |
+
"epoch": 0.04722699297223904,
|
| 386 |
+
"grad_norm": 0.32842758297920227,
|
| 387 |
+
"learning_rate": 0.00043,
|
| 388 |
+
"loss": 3.5191,
|
| 389 |
+
"num_input_tokens_seen": 2254438400,
|
| 390 |
+
"step": 2150
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"epoch": 0.04832529513438413,
|
| 394 |
+
"grad_norm": 0.3068505525588989,
|
| 395 |
+
"learning_rate": 0.00044,
|
| 396 |
+
"loss": 3.5009,
|
| 397 |
+
"num_input_tokens_seen": 2306867200,
|
| 398 |
+
"step": 2200
|
| 399 |
+
},
|
| 400 |
+
{
|
| 401 |
+
"epoch": 0.049423597296529224,
|
| 402 |
+
"grad_norm": 0.2950410544872284,
|
| 403 |
+
"learning_rate": 0.00045000000000000004,
|
| 404 |
+
"loss": 3.4796,
|
| 405 |
+
"num_input_tokens_seen": 2359296000,
|
| 406 |
+
"step": 2250
|
| 407 |
+
},
|
| 408 |
+
{
|
| 409 |
+
"epoch": 0.05052189945867432,
|
| 410 |
+
"grad_norm": 0.29731425642967224,
|
| 411 |
+
"learning_rate": 0.00046,
|
| 412 |
+
"loss": 3.4583,
|
| 413 |
+
"num_input_tokens_seen": 2411724800,
|
| 414 |
+
"step": 2300
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"epoch": 0.051620201620819414,
|
| 418 |
+
"grad_norm": 0.2702693045139313,
|
| 419 |
+
"learning_rate": 0.00047,
|
| 420 |
+
"loss": 3.4385,
|
| 421 |
+
"num_input_tokens_seen": 2464153600,
|
| 422 |
+
"step": 2350
|
| 423 |
+
},
|
| 424 |
+
{
|
| 425 |
+
"epoch": 0.05271850378296451,
|
| 426 |
+
"grad_norm": 0.2418452948331833,
|
| 427 |
+
"learning_rate": 0.00048,
|
| 428 |
+
"loss": 3.4244,
|
| 429 |
+
"num_input_tokens_seen": 2516582400,
|
| 430 |
+
"step": 2400
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 0.053816805945109604,
|
| 434 |
+
"grad_norm": 0.28668686747550964,
|
| 435 |
+
"learning_rate": 0.00049,
|
| 436 |
+
"loss": 3.3977,
|
| 437 |
+
"num_input_tokens_seen": 2569011200,
|
| 438 |
+
"step": 2450
|
| 439 |
+
},
|
| 440 |
+
{
|
| 441 |
+
"epoch": 0.054915108107254695,
|
| 442 |
+
"grad_norm": 0.3115544319152832,
|
| 443 |
+
"learning_rate": 0.0005,
|
| 444 |
+
"loss": 3.3881,
|
| 445 |
+
"num_input_tokens_seen": 2621440000,
|
| 446 |
+
"step": 2500
|
| 447 |
+
},
|
| 448 |
+
{
|
| 449 |
+
"epoch": 0.054915108107254695,
|
| 450 |
+
"eval_loss": 3.2789928913116455,
|
| 451 |
+
"eval_runtime": 62.6749,
|
| 452 |
+
"eval_samples_per_second": 79.777,
|
| 453 |
+
"eval_steps_per_second": 19.944,
|
| 454 |
+
"num_input_tokens_seen": 2621440000,
|
| 455 |
+
"step": 2500
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"epoch": 0.056013410269399794,
|
| 459 |
+
"grad_norm": 0.32340022921562195,
|
| 460 |
+
"learning_rate": 0.00051,
|
| 461 |
+
"loss": 3.3667,
|
| 462 |
+
"num_input_tokens_seen": 2673868800,
|
| 463 |
+
"step": 2550
|
| 464 |
+
},
|
| 465 |
+
{
|
| 466 |
+
"epoch": 0.057111712431544885,
|
| 467 |
+
"grad_norm": 0.2612442970275879,
|
| 468 |
+
"learning_rate": 0.0005200000000000001,
|
| 469 |
+
"loss": 3.3612,
|
| 470 |
+
"num_input_tokens_seen": 2726297600,
|
| 471 |
+
"step": 2600
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"epoch": 0.05821001459368998,
|
| 475 |
+
"grad_norm": 0.29934820532798767,
|
| 476 |
+
"learning_rate": 0.0005300000000000001,
|
| 477 |
+
"loss": 3.3386,
|
| 478 |
+
"num_input_tokens_seen": 2778726400,
|
| 479 |
+
"step": 2650
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"epoch": 0.059308316755835075,
|
| 483 |
+
"grad_norm": 0.2737022042274475,
|
| 484 |
+
"learning_rate": 0.00054,
|
| 485 |
+
"loss": 3.3274,
|
| 486 |
+
"num_input_tokens_seen": 2831155200,
|
| 487 |
+
"step": 2700
|
| 488 |
+
},
|
| 489 |
+
{
|
| 490 |
+
"epoch": 0.060406618917980166,
|
| 491 |
+
"grad_norm": 0.2101408988237381,
|
| 492 |
+
"learning_rate": 0.00055,
|
| 493 |
+
"loss": 3.3153,
|
| 494 |
+
"num_input_tokens_seen": 2883584000,
|
| 495 |
+
"step": 2750
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"epoch": 0.061504921080125265,
|
| 499 |
+
"grad_norm": 0.3240911066532135,
|
| 500 |
+
"learning_rate": 0.0005600000000000001,
|
| 501 |
+
"loss": 3.2978,
|
| 502 |
+
"num_input_tokens_seen": 2936012800,
|
| 503 |
+
"step": 2800
|
| 504 |
+
},
|
| 505 |
+
{
|
| 506 |
+
"epoch": 0.06260322324227036,
|
| 507 |
+
"grad_norm": 0.20592735707759857,
|
| 508 |
+
"learning_rate": 0.00057,
|
| 509 |
+
"loss": 3.2984,
|
| 510 |
+
"num_input_tokens_seen": 2988441600,
|
| 511 |
+
"step": 2850
|
| 512 |
+
},
|
| 513 |
+
{
|
| 514 |
+
"epoch": 0.06370152540441545,
|
| 515 |
+
"grad_norm": 0.263443261384964,
|
| 516 |
+
"learning_rate": 0.00058,
|
| 517 |
+
"loss": 3.2706,
|
| 518 |
+
"num_input_tokens_seen": 3040870400,
|
| 519 |
+
"step": 2900
|
| 520 |
+
},
|
| 521 |
+
{
|
| 522 |
+
"epoch": 0.06479982756656054,
|
| 523 |
+
"grad_norm": 0.24249990284442902,
|
| 524 |
+
"learning_rate": 0.00059,
|
| 525 |
+
"loss": 3.2673,
|
| 526 |
+
"num_input_tokens_seen": 3093299200,
|
| 527 |
+
"step": 2950
|
| 528 |
+
},
|
| 529 |
+
{
|
| 530 |
+
"epoch": 0.06589812972870564,
|
| 531 |
+
"grad_norm": 0.25961214303970337,
|
| 532 |
+
"learning_rate": 0.0006,
|
| 533 |
+
"loss": 3.2512,
|
| 534 |
+
"num_input_tokens_seen": 3145728000,
|
| 535 |
+
"step": 3000
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"epoch": 0.06589812972870564,
|
| 539 |
+
"eval_loss": 3.150442600250244,
|
| 540 |
+
"eval_runtime": 65.9549,
|
| 541 |
+
"eval_samples_per_second": 75.809,
|
| 542 |
+
"eval_steps_per_second": 18.952,
|
| 543 |
+
"num_input_tokens_seen": 3145728000,
|
| 544 |
+
"step": 3000
|
| 545 |
}
|
| 546 |
],
|
| 547 |
"logging_steps": 50,
|
| 548 |
"max_steps": 200000,
|
| 549 |
+
"num_input_tokens_seen": 3145728000,
|
| 550 |
"num_train_epochs": 5,
|
| 551 |
"save_steps": 1000,
|
| 552 |
"stateful_callbacks": {
|
|
|
|
| 561 |
"attributes": {}
|
| 562 |
}
|
| 563 |
},
|
| 564 |
+
"total_flos": 1.791515147894784e+18,
|
| 565 |
"train_batch_size": 64,
|
| 566 |
"trial_name": null,
|
| 567 |
"trial_params": null
|