Training in progress, step 70000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0815144315751afde957889b3801664e381aaf78af5aaa224fc2449fb124f643
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1eb8d729362c485fd51b577b67e8426946112c542b67c6dbee290cc17eda6309
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f91f0395ad8bb44fd81f1444330dede040f6b66dbc15e61e2a7fe4c1ef60aa2a
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c3a675e6db9104dd282c679a41cb4bdc17a98118d756c948d809458e24a6b37
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -12290,11 +12290,189 @@
|
|
| 12290 |
"eval_steps_per_second": 23.529,
|
| 12291 |
"num_input_tokens_seen": 18087936000,
|
| 12292 |
"step": 69000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12293 |
}
|
| 12294 |
],
|
| 12295 |
"logging_steps": 50,
|
| 12296 |
"max_steps": 70000,
|
| 12297 |
-
"num_input_tokens_seen":
|
| 12298 |
"num_train_epochs": 1,
|
| 12299 |
"save_steps": 1000,
|
| 12300 |
"stateful_callbacks": {
|
|
@@ -12304,12 +12482,12 @@
|
|
| 12304 |
"should_evaluate": false,
|
| 12305 |
"should_log": false,
|
| 12306 |
"should_save": true,
|
| 12307 |
-
"should_training_stop":
|
| 12308 |
},
|
| 12309 |
"attributes": {}
|
| 12310 |
}
|
| 12311 |
},
|
| 12312 |
-
"total_flos": 4.
|
| 12313 |
"train_batch_size": 64,
|
| 12314 |
"trial_name": null,
|
| 12315 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.4708549211906576,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 70000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 12290 |
"eval_steps_per_second": 23.529,
|
| 12291 |
"num_input_tokens_seen": 18087936000,
|
| 12292 |
"step": 69000
|
| 12293 |
+
},
|
| 12294 |
+
{
|
| 12295 |
+
"epoch": 0.4644647472602129,
|
| 12296 |
+
"grad_norm": 0.1424110382795334,
|
| 12297 |
+
"learning_rate": 8.67336033464411e-06,
|
| 12298 |
+
"loss": 2.9591,
|
| 12299 |
+
"num_input_tokens_seen": 18101043200,
|
| 12300 |
+
"step": 69050
|
| 12301 |
+
},
|
| 12302 |
+
{
|
| 12303 |
+
"epoch": 0.46480107220392053,
|
| 12304 |
+
"grad_norm": 0.14686723053455353,
|
| 12305 |
+
"learning_rate": 7.786715955054202e-06,
|
| 12306 |
+
"loss": 2.9561,
|
| 12307 |
+
"num_input_tokens_seen": 18114150400,
|
| 12308 |
+
"step": 69100
|
| 12309 |
+
},
|
| 12310 |
+
{
|
| 12311 |
+
"epoch": 0.46513739714762814,
|
| 12312 |
+
"grad_norm": 0.13719068467617035,
|
| 12313 |
+
"learning_rate": 6.947512116245669e-06,
|
| 12314 |
+
"loss": 2.9629,
|
| 12315 |
+
"num_input_tokens_seen": 18127257600,
|
| 12316 |
+
"step": 69150
|
| 12317 |
+
},
|
| 12318 |
+
{
|
| 12319 |
+
"epoch": 0.46547372209133575,
|
| 12320 |
+
"grad_norm": 0.14337210357189178,
|
| 12321 |
+
"learning_rate": 6.15582970243117e-06,
|
| 12322 |
+
"loss": 2.9713,
|
| 12323 |
+
"num_input_tokens_seen": 18140364800,
|
| 12324 |
+
"step": 69200
|
| 12325 |
+
},
|
| 12326 |
+
{
|
| 12327 |
+
"epoch": 0.46581004703504336,
|
| 12328 |
+
"grad_norm": 0.18305008113384247,
|
| 12329 |
+
"learning_rate": 5.411745017609493e-06,
|
| 12330 |
+
"loss": 2.9659,
|
| 12331 |
+
"num_input_tokens_seen": 18153472000,
|
| 12332 |
+
"step": 69250
|
| 12333 |
+
},
|
| 12334 |
+
{
|
| 12335 |
+
"epoch": 0.466146371978751,
|
| 12336 |
+
"grad_norm": 0.137322798371315,
|
| 12337 |
+
"learning_rate": 4.715329778211374e-06,
|
| 12338 |
+
"loss": 2.9678,
|
| 12339 |
+
"num_input_tokens_seen": 18166579200,
|
| 12340 |
+
"step": 69300
|
| 12341 |
+
},
|
| 12342 |
+
{
|
| 12343 |
+
"epoch": 0.4664826969224586,
|
| 12344 |
+
"grad_norm": 0.13300293684005737,
|
| 12345 |
+
"learning_rate": 4.066651106186981e-06,
|
| 12346 |
+
"loss": 2.9647,
|
| 12347 |
+
"num_input_tokens_seen": 18179686400,
|
| 12348 |
+
"step": 69350
|
| 12349 |
+
},
|
| 12350 |
+
{
|
| 12351 |
+
"epoch": 0.4668190218661662,
|
| 12352 |
+
"grad_norm": 0.13357709348201752,
|
| 12353 |
+
"learning_rate": 3.4657715225368535e-06,
|
| 12354 |
+
"loss": 2.965,
|
| 12355 |
+
"num_input_tokens_seen": 18192793600,
|
| 12356 |
+
"step": 69400
|
| 12357 |
+
},
|
| 12358 |
+
{
|
| 12359 |
+
"epoch": 0.4671553468098738,
|
| 12360 |
+
"grad_norm": 0.13399702310562134,
|
| 12361 |
+
"learning_rate": 2.9127489412859033e-06,
|
| 12362 |
+
"loss": 2.9614,
|
| 12363 |
+
"num_input_tokens_seen": 18205900800,
|
| 12364 |
+
"step": 69450
|
| 12365 |
+
},
|
| 12366 |
+
{
|
| 12367 |
+
"epoch": 0.4674916717535814,
|
| 12368 |
+
"grad_norm": 0.13703274726867676,
|
| 12369 |
+
"learning_rate": 2.4076366639015913e-06,
|
| 12370 |
+
"loss": 2.964,
|
| 12371 |
+
"num_input_tokens_seen": 18219008000,
|
| 12372 |
+
"step": 69500
|
| 12373 |
+
},
|
| 12374 |
+
{
|
| 12375 |
+
"epoch": 0.4674916717535814,
|
| 12376 |
+
"eval_loss": 2.8645894527435303,
|
| 12377 |
+
"eval_runtime": 53.3524,
|
| 12378 |
+
"eval_samples_per_second": 93.716,
|
| 12379 |
+
"eval_steps_per_second": 23.429,
|
| 12380 |
+
"num_input_tokens_seen": 18219008000,
|
| 12381 |
+
"step": 69500
|
| 12382 |
+
},
|
| 12383 |
+
{
|
| 12384 |
+
"epoch": 0.46782799669728903,
|
| 12385 |
+
"grad_norm": 0.3837803900241852,
|
| 12386 |
+
"learning_rate": 1.950483374156431e-06,
|
| 12387 |
+
"loss": 2.9665,
|
| 12388 |
+
"num_input_tokens_seen": 18232115200,
|
| 12389 |
+
"step": 69550
|
| 12390 |
+
},
|
| 12391 |
+
{
|
| 12392 |
+
"epoch": 0.46816432164099664,
|
| 12393 |
+
"grad_norm": 0.13585589826107025,
|
| 12394 |
+
"learning_rate": 1.541333133436018e-06,
|
| 12395 |
+
"loss": 2.9579,
|
| 12396 |
+
"num_input_tokens_seen": 18245222400,
|
| 12397 |
+
"step": 69600
|
| 12398 |
+
},
|
| 12399 |
+
{
|
| 12400 |
+
"epoch": 0.4685006465847043,
|
| 12401 |
+
"grad_norm": 0.13347585499286652,
|
| 12402 |
+
"learning_rate": 1.18022537649215e-06,
|
| 12403 |
+
"loss": 2.9636,
|
| 12404 |
+
"num_input_tokens_seen": 18258329600,
|
| 12405 |
+
"step": 69650
|
| 12406 |
+
},
|
| 12407 |
+
{
|
| 12408 |
+
"epoch": 0.4688369715284119,
|
| 12409 |
+
"grad_norm": 0.13726544380187988,
|
| 12410 |
+
"learning_rate": 8.671949076420882e-07,
|
| 12411 |
+
"loss": 2.9626,
|
| 12412 |
+
"num_input_tokens_seen": 18271436800,
|
| 12413 |
+
"step": 69700
|
| 12414 |
+
},
|
| 12415 |
+
{
|
| 12416 |
+
"epoch": 0.4691732964721195,
|
| 12417 |
+
"grad_norm": 0.14254987239837646,
|
| 12418 |
+
"learning_rate": 6.022718974137975e-07,
|
| 12419 |
+
"loss": 2.9698,
|
| 12420 |
+
"num_input_tokens_seen": 18284544000,
|
| 12421 |
+
"step": 69750
|
| 12422 |
+
},
|
| 12423 |
+
{
|
| 12424 |
+
"epoch": 0.46950962141582714,
|
| 12425 |
+
"grad_norm": 0.1329219937324524,
|
| 12426 |
+
"learning_rate": 3.854818796385495e-07,
|
| 12427 |
+
"loss": 2.96,
|
| 12428 |
+
"num_input_tokens_seen": 18297651200,
|
| 12429 |
+
"step": 69800
|
| 12430 |
+
},
|
| 12431 |
+
{
|
| 12432 |
+
"epoch": 0.46984594635953475,
|
| 12433 |
+
"grad_norm": 0.1384582668542862,
|
| 12434 |
+
"learning_rate": 2.1684574898939157e-07,
|
| 12435 |
+
"loss": 2.9693,
|
| 12436 |
+
"num_input_tokens_seen": 18310758400,
|
| 12437 |
+
"step": 69850
|
| 12438 |
+
},
|
| 12439 |
+
{
|
| 12440 |
+
"epoch": 0.47018227130324236,
|
| 12441 |
+
"grad_norm": 0.14365264773368835,
|
| 12442 |
+
"learning_rate": 9.637975896759077e-08,
|
| 12443 |
+
"loss": 2.9686,
|
| 12444 |
+
"num_input_tokens_seen": 18323865600,
|
| 12445 |
+
"step": 69900
|
| 12446 |
+
},
|
| 12447 |
+
{
|
| 12448 |
+
"epoch": 0.47051859624694997,
|
| 12449 |
+
"grad_norm": 0.13613733649253845,
|
| 12450 |
+
"learning_rate": 2.4095520335998265e-08,
|
| 12451 |
+
"loss": 2.9607,
|
| 12452 |
+
"num_input_tokens_seen": 18336972800,
|
| 12453 |
+
"step": 69950
|
| 12454 |
+
},
|
| 12455 |
+
{
|
| 12456 |
+
"epoch": 0.4708549211906576,
|
| 12457 |
+
"grad_norm": 0.14377959072589874,
|
| 12458 |
+
"learning_rate": 0.0,
|
| 12459 |
+
"loss": 2.9684,
|
| 12460 |
+
"num_input_tokens_seen": 18350080000,
|
| 12461 |
+
"step": 70000
|
| 12462 |
+
},
|
| 12463 |
+
{
|
| 12464 |
+
"epoch": 0.4708549211906576,
|
| 12465 |
+
"eval_loss": 2.8644959926605225,
|
| 12466 |
+
"eval_runtime": 54.0337,
|
| 12467 |
+
"eval_samples_per_second": 92.535,
|
| 12468 |
+
"eval_steps_per_second": 23.134,
|
| 12469 |
+
"num_input_tokens_seen": 18350080000,
|
| 12470 |
+
"step": 70000
|
| 12471 |
}
|
| 12472 |
],
|
| 12473 |
"logging_steps": 50,
|
| 12474 |
"max_steps": 70000,
|
| 12475 |
+
"num_input_tokens_seen": 18350080000,
|
| 12476 |
"num_train_epochs": 1,
|
| 12477 |
"save_steps": 1000,
|
| 12478 |
"stateful_callbacks": {
|
|
|
|
| 12482 |
"should_evaluate": false,
|
| 12483 |
"should_log": false,
|
| 12484 |
"should_save": true,
|
| 12485 |
+
"should_training_stop": true
|
| 12486 |
},
|
| 12487 |
"attributes": {}
|
| 12488 |
}
|
| 12489 |
},
|
| 12490 |
+
"total_flos": 4.9088254967808e+18,
|
| 12491 |
"train_batch_size": 64,
|
| 12492 |
"trial_name": null,
|
| 12493 |
"trial_params": null
|