Training in progress, step 70000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2cd2d83e8bcf2e24c3dfb835a421fc560d30e2495b3907943e618272cae7419
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:976f02141679563933b488383c3b486345d46612fec4531d1a034013ee84ec05
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f91f0395ad8bb44fd81f1444330dede040f6b66dbc15e61e2a7fe4c1ef60aa2a
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c3a675e6db9104dd282c679a41cb4bdc17a98118d756c948d809458e24a6b37
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -12290,11 +12290,189 @@
|
|
| 12290 |
"eval_steps_per_second": 23.618,
|
| 12291 |
"num_input_tokens_seen": 18087931456,
|
| 12292 |
"step": 69000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12293 |
}
|
| 12294 |
],
|
| 12295 |
"logging_steps": 50,
|
| 12296 |
"max_steps": 70000,
|
| 12297 |
-
"num_input_tokens_seen":
|
| 12298 |
"num_train_epochs": 1,
|
| 12299 |
"save_steps": 1000,
|
| 12300 |
"stateful_callbacks": {
|
|
@@ -12304,12 +12482,12 @@
|
|
| 12304 |
"should_evaluate": false,
|
| 12305 |
"should_log": false,
|
| 12306 |
"should_save": true,
|
| 12307 |
-
"should_training_stop":
|
| 12308 |
},
|
| 12309 |
"attributes": {}
|
| 12310 |
}
|
| 12311 |
},
|
| 12312 |
-
"total_flos": 4.
|
| 12313 |
"train_batch_size": 64,
|
| 12314 |
"trial_name": null,
|
| 12315 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.333901761331791,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 70000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 12290 |
"eval_steps_per_second": 23.618,
|
| 12291 |
"num_input_tokens_seen": 18087931456,
|
| 12292 |
"step": 69000
|
| 12293 |
+
},
|
| 12294 |
+
{
|
| 12295 |
+
"epoch": 0.3293702374280024,
|
| 12296 |
+
"grad_norm": 0.14565595984458923,
|
| 12297 |
+
"learning_rate": 1.1318413143740436e-05,
|
| 12298 |
+
"loss": 2.5358,
|
| 12299 |
+
"num_input_tokens_seen": 18101038656,
|
| 12300 |
+
"step": 69050
|
| 12301 |
+
},
|
| 12302 |
+
{
|
| 12303 |
+
"epoch": 0.3296087386860966,
|
| 12304 |
+
"grad_norm": 0.15810008347034454,
|
| 12305 |
+
"learning_rate": 1.0162300788382261e-05,
|
| 12306 |
+
"loss": 2.5288,
|
| 12307 |
+
"num_input_tokens_seen": 18114145856,
|
| 12308 |
+
"step": 69100
|
| 12309 |
+
},
|
| 12310 |
+
{
|
| 12311 |
+
"epoch": 0.3298472399441907,
|
| 12312 |
+
"grad_norm": 0.14960281550884247,
|
| 12313 |
+
"learning_rate": 9.0678523819408e-06,
|
| 12314 |
+
"loss": 2.5267,
|
| 12315 |
+
"num_input_tokens_seen": 18127253056,
|
| 12316 |
+
"step": 69150
|
| 12317 |
+
},
|
| 12318 |
+
{
|
| 12319 |
+
"epoch": 0.33008574120228484,
|
| 12320 |
+
"grad_norm": 0.14473624527454376,
|
| 12321 |
+
"learning_rate": 8.035205700685167e-06,
|
| 12322 |
+
"loss": 2.5133,
|
| 12323 |
+
"num_input_tokens_seen": 18140360256,
|
| 12324 |
+
"step": 69200
|
| 12325 |
+
},
|
| 12326 |
+
{
|
| 12327 |
+
"epoch": 0.330324242460379,
|
| 12328 |
+
"grad_norm": 0.1450708657503128,
|
| 12329 |
+
"learning_rate": 7.064490740882057e-06,
|
| 12330 |
+
"loss": 2.5302,
|
| 12331 |
+
"num_input_tokens_seen": 18153467456,
|
| 12332 |
+
"step": 69250
|
| 12333 |
+
},
|
| 12334 |
+
{
|
| 12335 |
+
"epoch": 0.3305627437184731,
|
| 12336 |
+
"grad_norm": 0.14883211255073547,
|
| 12337 |
+
"learning_rate": 6.15582970243117e-06,
|
| 12338 |
+
"loss": 2.5307,
|
| 12339 |
+
"num_input_tokens_seen": 18166574656,
|
| 12340 |
+
"step": 69300
|
| 12341 |
+
},
|
| 12342 |
+
{
|
| 12343 |
+
"epoch": 0.33080124497656727,
|
| 12344 |
+
"grad_norm": 0.15696081519126892,
|
| 12345 |
+
"learning_rate": 5.309336973481682e-06,
|
| 12346 |
+
"loss": 2.5341,
|
| 12347 |
+
"num_input_tokens_seen": 18179681856,
|
| 12348 |
+
"step": 69350
|
| 12349 |
+
},
|
| 12350 |
+
{
|
| 12351 |
+
"epoch": 0.33103974623466137,
|
| 12352 |
+
"grad_norm": 0.1564367264509201,
|
| 12353 |
+
"learning_rate": 4.52511911603265e-06,
|
| 12354 |
+
"loss": 2.5299,
|
| 12355 |
+
"num_input_tokens_seen": 18192789056,
|
| 12356 |
+
"step": 69400
|
| 12357 |
+
},
|
| 12358 |
+
{
|
| 12359 |
+
"epoch": 0.33127824749275553,
|
| 12360 |
+
"grad_norm": 0.15558916330337524,
|
| 12361 |
+
"learning_rate": 3.803274852517968e-06,
|
| 12362 |
+
"loss": 2.5197,
|
| 12363 |
+
"num_input_tokens_seen": 18205896256,
|
| 12364 |
+
"step": 69450
|
| 12365 |
+
},
|
| 12366 |
+
{
|
| 12367 |
+
"epoch": 0.3315167487508497,
|
| 12368 |
+
"grad_norm": 0.1532556265592575,
|
| 12369 |
+
"learning_rate": 3.143895053378698e-06,
|
| 12370 |
+
"loss": 2.5176,
|
| 12371 |
+
"num_input_tokens_seen": 18219003456,
|
| 12372 |
+
"step": 69500
|
| 12373 |
+
},
|
| 12374 |
+
{
|
| 12375 |
+
"epoch": 0.3315167487508497,
|
| 12376 |
+
"eval_loss": 2.412046194076538,
|
| 12377 |
+
"eval_runtime": 53.2476,
|
| 12378 |
+
"eval_samples_per_second": 93.901,
|
| 12379 |
+
"eval_steps_per_second": 23.475,
|
| 12380 |
+
"num_input_tokens_seen": 18219003456,
|
| 12381 |
+
"step": 69500
|
| 12382 |
+
},
|
| 12383 |
+
{
|
| 12384 |
+
"epoch": 0.3317552500089438,
|
| 12385 |
+
"grad_norm": 0.1502823829650879,
|
| 12386 |
+
"learning_rate": 2.547062725623828e-06,
|
| 12387 |
+
"loss": 2.5207,
|
| 12388 |
+
"num_input_tokens_seen": 18232110656,
|
| 12389 |
+
"step": 69550
|
| 12390 |
+
},
|
| 12391 |
+
{
|
| 12392 |
+
"epoch": 0.33199375126703795,
|
| 12393 |
+
"grad_norm": 0.1560440957546234,
|
| 12394 |
+
"learning_rate": 2.012853002380466e-06,
|
| 12395 |
+
"loss": 2.5078,
|
| 12396 |
+
"num_input_tokens_seen": 18245217856,
|
| 12397 |
+
"step": 69600
|
| 12398 |
+
},
|
| 12399 |
+
{
|
| 12400 |
+
"epoch": 0.33223225252513205,
|
| 12401 |
+
"grad_norm": 0.15284490585327148,
|
| 12402 |
+
"learning_rate": 1.541333133436018e-06,
|
| 12403 |
+
"loss": 2.5404,
|
| 12404 |
+
"num_input_tokens_seen": 18258325056,
|
| 12405 |
+
"step": 69650
|
| 12406 |
+
},
|
| 12407 |
+
{
|
| 12408 |
+
"epoch": 0.3324707537832262,
|
| 12409 |
+
"grad_norm": 0.14594900608062744,
|
| 12410 |
+
"learning_rate": 1.132562476771959e-06,
|
| 12411 |
+
"loss": 2.5267,
|
| 12412 |
+
"num_input_tokens_seen": 18271432256,
|
| 12413 |
+
"step": 69700
|
| 12414 |
+
},
|
| 12415 |
+
{
|
| 12416 |
+
"epoch": 0.3327092550413203,
|
| 12417 |
+
"grad_norm": 0.15198394656181335,
|
| 12418 |
+
"learning_rate": 7.865924910916978e-07,
|
| 12419 |
+
"loss": 2.5232,
|
| 12420 |
+
"num_input_tokens_seen": 18284539456,
|
| 12421 |
+
"step": 69750
|
| 12422 |
+
},
|
| 12423 |
+
{
|
| 12424 |
+
"epoch": 0.3329477562994145,
|
| 12425 |
+
"grad_norm": 0.15011271834373474,
|
| 12426 |
+
"learning_rate": 5.034667293427053e-07,
|
| 12427 |
+
"loss": 2.5308,
|
| 12428 |
+
"num_input_tokens_seen": 18297646656,
|
| 12429 |
+
"step": 69800
|
| 12430 |
+
},
|
| 12431 |
+
{
|
| 12432 |
+
"epoch": 0.33318625755750864,
|
| 12433 |
+
"grad_norm": 0.147654727101326,
|
| 12434 |
+
"learning_rate": 2.8322083323334415e-07,
|
| 12435 |
+
"loss": 2.5281,
|
| 12436 |
+
"num_input_tokens_seen": 18310753856,
|
| 12437 |
+
"step": 69850
|
| 12438 |
+
},
|
| 12439 |
+
{
|
| 12440 |
+
"epoch": 0.33342475881560274,
|
| 12441 |
+
"grad_norm": 0.15056386590003967,
|
| 12442 |
+
"learning_rate": 1.2588252874673466e-07,
|
| 12443 |
+
"loss": 2.5112,
|
| 12444 |
+
"num_input_tokens_seen": 18323861056,
|
| 12445 |
+
"step": 69900
|
| 12446 |
+
},
|
| 12447 |
+
{
|
| 12448 |
+
"epoch": 0.3336632600736969,
|
| 12449 |
+
"grad_norm": 0.14858213067054749,
|
| 12450 |
+
"learning_rate": 3.147162264971471e-08,
|
| 12451 |
+
"loss": 2.5226,
|
| 12452 |
+
"num_input_tokens_seen": 18336968256,
|
| 12453 |
+
"step": 69950
|
| 12454 |
+
},
|
| 12455 |
+
{
|
| 12456 |
+
"epoch": 0.333901761331791,
|
| 12457 |
+
"grad_norm": 0.1534891128540039,
|
| 12458 |
+
"learning_rate": 0.0,
|
| 12459 |
+
"loss": 2.5303,
|
| 12460 |
+
"num_input_tokens_seen": 18350075456,
|
| 12461 |
+
"step": 70000
|
| 12462 |
+
},
|
| 12463 |
+
{
|
| 12464 |
+
"epoch": 0.333901761331791,
|
| 12465 |
+
"eval_loss": 2.411842107772827,
|
| 12466 |
+
"eval_runtime": 53.9812,
|
| 12467 |
+
"eval_samples_per_second": 92.625,
|
| 12468 |
+
"eval_steps_per_second": 23.156,
|
| 12469 |
+
"num_input_tokens_seen": 18350075456,
|
| 12470 |
+
"step": 70000
|
| 12471 |
}
|
| 12472 |
],
|
| 12473 |
"logging_steps": 50,
|
| 12474 |
"max_steps": 70000,
|
| 12475 |
+
"num_input_tokens_seen": 18350075456,
|
| 12476 |
"num_train_epochs": 1,
|
| 12477 |
"save_steps": 1000,
|
| 12478 |
"stateful_callbacks": {
|
|
|
|
| 12482 |
"should_evaluate": false,
|
| 12483 |
"should_log": false,
|
| 12484 |
"should_save": true,
|
| 12485 |
+
"should_training_stop": true
|
| 12486 |
},
|
| 12487 |
"attributes": {}
|
| 12488 |
}
|
| 12489 |
},
|
| 12490 |
+
"total_flos": 4.908824281216451e+18,
|
| 12491 |
"train_batch_size": 64,
|
| 12492 |
"trial_name": null,
|
| 12493 |
"trial_params": null
|