Training in progress, step 31000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99efb4f925ebae40cd6f793929b87a0ccac0e7b97e6def05084db3705337b811
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aba48d7345e1335acdd811f72ad9602a930b00d7d91d9a11216fc53d7f15cb25
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17ffd9dd4a600ef00ffe7371c71cf7eaaf39e90e97468b4a36b4cc557b2fc5d1
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:277f21680b959b596662b48a96a00aaa486d9a86675c2da90af20e0783552321
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5348,11 +5348,189 @@
|
|
| 5348 |
"eval_steps_per_second": 18.914,
|
| 5349 |
"num_input_tokens_seen": 31457276160,
|
| 5350 |
"step": 30000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5351 |
}
|
| 5352 |
],
|
| 5353 |
"logging_steps": 50,
|
| 5354 |
"max_steps": 200000,
|
| 5355 |
-
"num_input_tokens_seen":
|
| 5356 |
"num_train_epochs": 5,
|
| 5357 |
"save_steps": 1000,
|
| 5358 |
"stateful_callbacks": {
|
|
@@ -5367,7 +5545,7 @@
|
|
| 5367 |
"attributes": {}
|
| 5368 |
}
|
| 5369 |
},
|
| 5370 |
-
"total_flos": 1.
|
| 5371 |
"train_batch_size": 64,
|
| 5372 |
"trial_name": null,
|
| 5373 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.6809473405299582,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 31000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5348 |
"eval_steps_per_second": 18.914,
|
| 5349 |
"num_input_tokens_seen": 31457276160,
|
| 5350 |
"step": 30000
|
| 5351 |
+
},
|
| 5352 |
+
{
|
| 5353 |
+
"epoch": 0.6600795994492015,
|
| 5354 |
+
"grad_norm": 0.14465224742889404,
|
| 5355 |
+
"learning_rate": 0.001,
|
| 5356 |
+
"loss": 2.6657,
|
| 5357 |
+
"num_input_tokens_seen": 31509704960,
|
| 5358 |
+
"step": 30050
|
| 5359 |
+
},
|
| 5360 |
+
{
|
| 5361 |
+
"epoch": 0.6611779016113466,
|
| 5362 |
+
"grad_norm": 0.16096332669258118,
|
| 5363 |
+
"learning_rate": 0.001,
|
| 5364 |
+
"loss": 2.6612,
|
| 5365 |
+
"num_input_tokens_seen": 31562133760,
|
| 5366 |
+
"step": 30100
|
| 5367 |
+
},
|
| 5368 |
+
{
|
| 5369 |
+
"epoch": 0.6622762037734916,
|
| 5370 |
+
"grad_norm": 0.1434296816587448,
|
| 5371 |
+
"learning_rate": 0.001,
|
| 5372 |
+
"loss": 2.6695,
|
| 5373 |
+
"num_input_tokens_seen": 31614562560,
|
| 5374 |
+
"step": 30150
|
| 5375 |
+
},
|
| 5376 |
+
{
|
| 5377 |
+
"epoch": 0.6633745059356367,
|
| 5378 |
+
"grad_norm": 0.13844367861747742,
|
| 5379 |
+
"learning_rate": 0.001,
|
| 5380 |
+
"loss": 2.6649,
|
| 5381 |
+
"num_input_tokens_seen": 31666991360,
|
| 5382 |
+
"step": 30200
|
| 5383 |
+
},
|
| 5384 |
+
{
|
| 5385 |
+
"epoch": 0.6644728080977819,
|
| 5386 |
+
"grad_norm": 0.1579446643590927,
|
| 5387 |
+
"learning_rate": 0.001,
|
| 5388 |
+
"loss": 2.6701,
|
| 5389 |
+
"num_input_tokens_seen": 31719420160,
|
| 5390 |
+
"step": 30250
|
| 5391 |
+
},
|
| 5392 |
+
{
|
| 5393 |
+
"epoch": 0.665571110259927,
|
| 5394 |
+
"grad_norm": 0.1585385501384735,
|
| 5395 |
+
"learning_rate": 0.001,
|
| 5396 |
+
"loss": 2.665,
|
| 5397 |
+
"num_input_tokens_seen": 31771848960,
|
| 5398 |
+
"step": 30300
|
| 5399 |
+
},
|
| 5400 |
+
{
|
| 5401 |
+
"epoch": 0.666669412422072,
|
| 5402 |
+
"grad_norm": 0.18768636882305145,
|
| 5403 |
+
"learning_rate": 0.001,
|
| 5404 |
+
"loss": 2.6708,
|
| 5405 |
+
"num_input_tokens_seen": 31824277760,
|
| 5406 |
+
"step": 30350
|
| 5407 |
+
},
|
| 5408 |
+
{
|
| 5409 |
+
"epoch": 0.6677677145842171,
|
| 5410 |
+
"grad_norm": 0.13027966022491455,
|
| 5411 |
+
"learning_rate": 0.001,
|
| 5412 |
+
"loss": 2.6657,
|
| 5413 |
+
"num_input_tokens_seen": 31876706560,
|
| 5414 |
+
"step": 30400
|
| 5415 |
+
},
|
| 5416 |
+
{
|
| 5417 |
+
"epoch": 0.6688660167463623,
|
| 5418 |
+
"grad_norm": 0.13473722338676453,
|
| 5419 |
+
"learning_rate": 0.001,
|
| 5420 |
+
"loss": 2.6658,
|
| 5421 |
+
"num_input_tokens_seen": 31929135360,
|
| 5422 |
+
"step": 30450
|
| 5423 |
+
},
|
| 5424 |
+
{
|
| 5425 |
+
"epoch": 0.6699643189085073,
|
| 5426 |
+
"grad_norm": 0.14617317914962769,
|
| 5427 |
+
"learning_rate": 0.001,
|
| 5428 |
+
"loss": 2.664,
|
| 5429 |
+
"num_input_tokens_seen": 31981564160,
|
| 5430 |
+
"step": 30500
|
| 5431 |
+
},
|
| 5432 |
+
{
|
| 5433 |
+
"epoch": 0.6699643189085073,
|
| 5434 |
+
"eval_loss": 2.5658769607543945,
|
| 5435 |
+
"eval_runtime": 67.5011,
|
| 5436 |
+
"eval_samples_per_second": 74.073,
|
| 5437 |
+
"eval_steps_per_second": 18.518,
|
| 5438 |
+
"num_input_tokens_seen": 31981564160,
|
| 5439 |
+
"step": 30500
|
| 5440 |
+
},
|
| 5441 |
+
{
|
| 5442 |
+
"epoch": 0.6710626210706524,
|
| 5443 |
+
"grad_norm": 0.14581717550754547,
|
| 5444 |
+
"learning_rate": 0.001,
|
| 5445 |
+
"loss": 2.6654,
|
| 5446 |
+
"num_input_tokens_seen": 32033992960,
|
| 5447 |
+
"step": 30550
|
| 5448 |
+
},
|
| 5449 |
+
{
|
| 5450 |
+
"epoch": 0.6721609232327975,
|
| 5451 |
+
"grad_norm": 0.12281567603349686,
|
| 5452 |
+
"learning_rate": 0.001,
|
| 5453 |
+
"loss": 2.6649,
|
| 5454 |
+
"num_input_tokens_seen": 32086421760,
|
| 5455 |
+
"step": 30600
|
| 5456 |
+
},
|
| 5457 |
+
{
|
| 5458 |
+
"epoch": 0.6732592253949425,
|
| 5459 |
+
"grad_norm": 0.14368072152137756,
|
| 5460 |
+
"learning_rate": 0.001,
|
| 5461 |
+
"loss": 2.6605,
|
| 5462 |
+
"num_input_tokens_seen": 32138850560,
|
| 5463 |
+
"step": 30650
|
| 5464 |
+
},
|
| 5465 |
+
{
|
| 5466 |
+
"epoch": 0.6743575275570877,
|
| 5467 |
+
"grad_norm": 0.14596907794475555,
|
| 5468 |
+
"learning_rate": 0.001,
|
| 5469 |
+
"loss": 2.6651,
|
| 5470 |
+
"num_input_tokens_seen": 32191279360,
|
| 5471 |
+
"step": 30700
|
| 5472 |
+
},
|
| 5473 |
+
{
|
| 5474 |
+
"epoch": 0.6754558297192328,
|
| 5475 |
+
"grad_norm": 0.15414392948150635,
|
| 5476 |
+
"learning_rate": 0.001,
|
| 5477 |
+
"loss": 2.6696,
|
| 5478 |
+
"num_input_tokens_seen": 32243708160,
|
| 5479 |
+
"step": 30750
|
| 5480 |
+
},
|
| 5481 |
+
{
|
| 5482 |
+
"epoch": 0.6765541318813779,
|
| 5483 |
+
"grad_norm": 0.14875884354114532,
|
| 5484 |
+
"learning_rate": 0.001,
|
| 5485 |
+
"loss": 2.6662,
|
| 5486 |
+
"num_input_tokens_seen": 32296136960,
|
| 5487 |
+
"step": 30800
|
| 5488 |
+
},
|
| 5489 |
+
{
|
| 5490 |
+
"epoch": 0.6776524340435229,
|
| 5491 |
+
"grad_norm": 0.13774773478507996,
|
| 5492 |
+
"learning_rate": 0.001,
|
| 5493 |
+
"loss": 2.6649,
|
| 5494 |
+
"num_input_tokens_seen": 32348565760,
|
| 5495 |
+
"step": 30850
|
| 5496 |
+
},
|
| 5497 |
+
{
|
| 5498 |
+
"epoch": 0.6787507362056681,
|
| 5499 |
+
"grad_norm": 0.1647578626871109,
|
| 5500 |
+
"learning_rate": 0.001,
|
| 5501 |
+
"loss": 2.6693,
|
| 5502 |
+
"num_input_tokens_seen": 32400994560,
|
| 5503 |
+
"step": 30900
|
| 5504 |
+
},
|
| 5505 |
+
{
|
| 5506 |
+
"epoch": 0.6798490383678132,
|
| 5507 |
+
"grad_norm": 0.1620490700006485,
|
| 5508 |
+
"learning_rate": 0.001,
|
| 5509 |
+
"loss": 2.6726,
|
| 5510 |
+
"num_input_tokens_seen": 32453423360,
|
| 5511 |
+
"step": 30950
|
| 5512 |
+
},
|
| 5513 |
+
{
|
| 5514 |
+
"epoch": 0.6809473405299582,
|
| 5515 |
+
"grad_norm": 0.14238062500953674,
|
| 5516 |
+
"learning_rate": 0.001,
|
| 5517 |
+
"loss": 2.6681,
|
| 5518 |
+
"num_input_tokens_seen": 32505852160,
|
| 5519 |
+
"step": 31000
|
| 5520 |
+
},
|
| 5521 |
+
{
|
| 5522 |
+
"epoch": 0.6809473405299582,
|
| 5523 |
+
"eval_loss": 2.5645763874053955,
|
| 5524 |
+
"eval_runtime": 65.7725,
|
| 5525 |
+
"eval_samples_per_second": 76.02,
|
| 5526 |
+
"eval_steps_per_second": 19.005,
|
| 5527 |
+
"num_input_tokens_seen": 32505852160,
|
| 5528 |
+
"step": 31000
|
| 5529 |
}
|
| 5530 |
],
|
| 5531 |
"logging_steps": 50,
|
| 5532 |
"max_steps": 200000,
|
| 5533 |
+
"num_input_tokens_seen": 32505852160,
|
| 5534 |
"num_train_epochs": 5,
|
| 5535 |
"save_steps": 1000,
|
| 5536 |
"stateful_callbacks": {
|
|
|
|
| 5545 |
"attributes": {}
|
| 5546 |
}
|
| 5547 |
},
|
| 5548 |
+
"total_flos": 1.851232100800463e+19,
|
| 5549 |
"train_batch_size": 64,
|
| 5550 |
"trial_name": null,
|
| 5551 |
"trial_params": null
|