Training in progress, step 132000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d9e57d7de320997016d5d2199393f3c6d5ccbb8649da9e46ae713874cd8a8e24
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f36f287a4da99bdaf6e0deca55af9eddec679234fd5785a93d74c5b7275a731
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3da6ad8ffd940afd42f47dbccd6a99fedee37b4e239b9c682223ad1635ee1326
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21aed170a2d0b5ca9750f891383cff878afad1161fd25ef679259c6d8c42258b
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -23326,11 +23326,189 @@
|
|
| 23326 |
"eval_steps_per_second": 15.164,
|
| 23327 |
"num_input_tokens_seen": 68670633664,
|
| 23328 |
"step": 131000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23329 |
}
|
| 23330 |
],
|
| 23331 |
"logging_steps": 50,
|
| 23332 |
"max_steps": 140000,
|
| 23333 |
-
"num_input_tokens_seen":
|
| 23334 |
"num_train_epochs": 2,
|
| 23335 |
"save_steps": 1000,
|
| 23336 |
"stateful_callbacks": {
|
|
@@ -23345,7 +23523,7 @@
|
|
| 23345 |
"attributes": {}
|
| 23346 |
}
|
| 23347 |
},
|
| 23348 |
-
"total_flos": 1.
|
| 23349 |
"train_batch_size": 32,
|
| 23350 |
"trial_name": null,
|
| 23351 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.2592890277496214,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 132000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 23326 |
"eval_steps_per_second": 15.164,
|
| 23327 |
"num_input_tokens_seen": 68670633664,
|
| 23328 |
"step": 131000
|
| 23329 |
+
},
|
| 23330 |
+
{
|
| 23331 |
+
"epoch": 1.2502259799420443,
|
| 23332 |
+
"grad_norm": 0.13041457533836365,
|
| 23333 |
+
"learning_rate": 0.000231613104386454,
|
| 23334 |
+
"loss": 2.0362,
|
| 23335 |
+
"num_input_tokens_seen": 68696842016,
|
| 23336 |
+
"step": 131050
|
| 23337 |
+
},
|
| 23338 |
+
{
|
| 23339 |
+
"epoch": 1.2507029824582325,
|
| 23340 |
+
"grad_norm": 0.1306309849023819,
|
| 23341 |
+
"learning_rate": 0.00022925069366813716,
|
| 23342 |
+
"loss": 2.0593,
|
| 23343 |
+
"num_input_tokens_seen": 68723054176,
|
| 23344 |
+
"step": 131100
|
| 23345 |
+
},
|
| 23346 |
+
{
|
| 23347 |
+
"epoch": 1.2511799849744207,
|
| 23348 |
+
"grad_norm": 0.12761172652244568,
|
| 23349 |
+
"learning_rate": 0.00022689680393686457,
|
| 23350 |
+
"loss": 2.0496,
|
| 23351 |
+
"num_input_tokens_seen": 68749263552,
|
| 23352 |
+
"step": 131150
|
| 23353 |
+
},
|
| 23354 |
+
{
|
| 23355 |
+
"epoch": 1.251656987490609,
|
| 23356 |
+
"grad_norm": 0.12187056988477707,
|
| 23357 |
+
"learning_rate": 0.0002245515092739488,
|
| 23358 |
+
"loss": 2.0417,
|
| 23359 |
+
"num_input_tokens_seen": 68775477952,
|
| 23360 |
+
"step": 131200
|
| 23361 |
+
},
|
| 23362 |
+
{
|
| 23363 |
+
"epoch": 1.2521339900067971,
|
| 23364 |
+
"grad_norm": 0.12770666182041168,
|
| 23365 |
+
"learning_rate": 0.00022221488349019903,
|
| 23366 |
+
"loss": 2.0332,
|
| 23367 |
+
"num_input_tokens_seen": 68801692352,
|
| 23368 |
+
"step": 131250
|
| 23369 |
+
},
|
| 23370 |
+
{
|
| 23371 |
+
"epoch": 1.2526109925229856,
|
| 23372 |
+
"grad_norm": 0.13457396626472473,
|
| 23373 |
+
"learning_rate": 0.00021988700012359863,
|
| 23374 |
+
"loss": 2.0393,
|
| 23375 |
+
"num_input_tokens_seen": 68827900832,
|
| 23376 |
+
"step": 131300
|
| 23377 |
+
},
|
| 23378 |
+
{
|
| 23379 |
+
"epoch": 1.2530879950391738,
|
| 23380 |
+
"grad_norm": 0.12845295667648315,
|
| 23381 |
+
"learning_rate": 0.0002175679324369913,
|
| 23382 |
+
"loss": 2.0507,
|
| 23383 |
+
"num_input_tokens_seen": 68854107328,
|
| 23384 |
+
"step": 131350
|
| 23385 |
+
},
|
| 23386 |
+
{
|
| 23387 |
+
"epoch": 1.2535649975553622,
|
| 23388 |
+
"grad_norm": 0.12990029156208038,
|
| 23389 |
+
"learning_rate": 0.00021525775341577403,
|
| 23390 |
+
"loss": 2.0373,
|
| 23391 |
+
"num_input_tokens_seen": 68880316256,
|
| 23392 |
+
"step": 131400
|
| 23393 |
+
},
|
| 23394 |
+
{
|
| 23395 |
+
"epoch": 1.2540420000715504,
|
| 23396 |
+
"grad_norm": 0.12344187498092651,
|
| 23397 |
+
"learning_rate": 0.00021295653576560165,
|
| 23398 |
+
"loss": 2.0359,
|
| 23399 |
+
"num_input_tokens_seen": 68906521376,
|
| 23400 |
+
"step": 131450
|
| 23401 |
+
},
|
| 23402 |
+
{
|
| 23403 |
+
"epoch": 1.2545190025877386,
|
| 23404 |
+
"grad_norm": 0.12487955391407013,
|
| 23405 |
+
"learning_rate": 0.00021066435191009715,
|
| 23406 |
+
"loss": 2.0432,
|
| 23407 |
+
"num_input_tokens_seen": 68932735776,
|
| 23408 |
+
"step": 131500
|
| 23409 |
+
},
|
| 23410 |
+
{
|
| 23411 |
+
"epoch": 1.2545190025877386,
|
| 23412 |
+
"eval_loss": 1.9613933563232422,
|
| 23413 |
+
"eval_runtime": 82.9225,
|
| 23414 |
+
"eval_samples_per_second": 60.297,
|
| 23415 |
+
"eval_steps_per_second": 15.074,
|
| 23416 |
+
"num_input_tokens_seen": 68932735776,
|
| 23417 |
+
"step": 131500
|
| 23418 |
+
},
|
| 23419 |
+
{
|
| 23420 |
+
"epoch": 1.2549960051039268,
|
| 23421 |
+
"grad_norm": 0.13224980235099792,
|
| 23422 |
+
"learning_rate": 0.00020838127398857382,
|
| 23423 |
+
"loss": 2.0413,
|
| 23424 |
+
"num_input_tokens_seen": 68958946656,
|
| 23425 |
+
"step": 131550
|
| 23426 |
+
},
|
| 23427 |
+
{
|
| 23428 |
+
"epoch": 1.2554730076201153,
|
| 23429 |
+
"grad_norm": 0.12449366599321365,
|
| 23430 |
+
"learning_rate": 0.00020610737385376348,
|
| 23431 |
+
"loss": 2.0503,
|
| 23432 |
+
"num_input_tokens_seen": 68985155520,
|
| 23433 |
+
"step": 131600
|
| 23434 |
+
},
|
| 23435 |
+
{
|
| 23436 |
+
"epoch": 1.2559500101363035,
|
| 23437 |
+
"grad_norm": 0.12943805754184723,
|
| 23438 |
+
"learning_rate": 0.0002038427230695565,
|
| 23439 |
+
"loss": 2.0476,
|
| 23440 |
+
"num_input_tokens_seen": 69011368384,
|
| 23441 |
+
"step": 131650
|
| 23442 |
+
},
|
| 23443 |
+
{
|
| 23444 |
+
"epoch": 1.2564270126524917,
|
| 23445 |
+
"grad_norm": 0.1288331300020218,
|
| 23446 |
+
"learning_rate": 0.00020158739290874821,
|
| 23447 |
+
"loss": 2.0458,
|
| 23448 |
+
"num_input_tokens_seen": 69037580736,
|
| 23449 |
+
"step": 131700
|
| 23450 |
+
},
|
| 23451 |
+
{
|
| 23452 |
+
"epoch": 1.25690401516868,
|
| 23453 |
+
"grad_norm": 0.12655895948410034,
|
| 23454 |
+
"learning_rate": 0.00019934145435079704,
|
| 23455 |
+
"loss": 2.0474,
|
| 23456 |
+
"num_input_tokens_seen": 69063793760,
|
| 23457 |
+
"step": 131750
|
| 23458 |
+
},
|
| 23459 |
+
{
|
| 23460 |
+
"epoch": 1.2573810176848683,
|
| 23461 |
+
"grad_norm": 0.1263783723115921,
|
| 23462 |
+
"learning_rate": 0.0001971049780795901,
|
| 23463 |
+
"loss": 2.0387,
|
| 23464 |
+
"num_input_tokens_seen": 69090002496,
|
| 23465 |
+
"step": 131800
|
| 23466 |
+
},
|
| 23467 |
+
{
|
| 23468 |
+
"epoch": 1.2578580202010565,
|
| 23469 |
+
"grad_norm": 0.13202515244483948,
|
| 23470 |
+
"learning_rate": 0.0001948780344812181,
|
| 23471 |
+
"loss": 2.0531,
|
| 23472 |
+
"num_input_tokens_seen": 69116216896,
|
| 23473 |
+
"step": 131850
|
| 23474 |
+
},
|
| 23475 |
+
{
|
| 23476 |
+
"epoch": 1.2583350227172447,
|
| 23477 |
+
"grad_norm": 0.12061940133571625,
|
| 23478 |
+
"learning_rate": 0.00019266069364176142,
|
| 23479 |
+
"loss": 2.052,
|
| 23480 |
+
"num_input_tokens_seen": 69142427680,
|
| 23481 |
+
"step": 131900
|
| 23482 |
+
},
|
| 23483 |
+
{
|
| 23484 |
+
"epoch": 1.2588120252334332,
|
| 23485 |
+
"grad_norm": 0.1222308874130249,
|
| 23486 |
+
"learning_rate": 0.00019045302534508295,
|
| 23487 |
+
"loss": 2.0409,
|
| 23488 |
+
"num_input_tokens_seen": 69168631136,
|
| 23489 |
+
"step": 131950
|
| 23490 |
+
},
|
| 23491 |
+
{
|
| 23492 |
+
"epoch": 1.2592890277496214,
|
| 23493 |
+
"grad_norm": 0.11664976924657822,
|
| 23494 |
+
"learning_rate": 0.00018825509907063325,
|
| 23495 |
+
"loss": 2.0361,
|
| 23496 |
+
"num_input_tokens_seen": 69194840608,
|
| 23497 |
+
"step": 132000
|
| 23498 |
+
},
|
| 23499 |
+
{
|
| 23500 |
+
"epoch": 1.2592890277496214,
|
| 23501 |
+
"eval_loss": 1.9602855443954468,
|
| 23502 |
+
"eval_runtime": 82.6066,
|
| 23503 |
+
"eval_samples_per_second": 60.528,
|
| 23504 |
+
"eval_steps_per_second": 15.132,
|
| 23505 |
+
"num_input_tokens_seen": 69194840608,
|
| 23506 |
+
"step": 132000
|
| 23507 |
}
|
| 23508 |
],
|
| 23509 |
"logging_steps": 50,
|
| 23510 |
"max_steps": 140000,
|
| 23511 |
+
"num_input_tokens_seen": 69194840608,
|
| 23512 |
"num_train_epochs": 2,
|
| 23513 |
"save_steps": 1000,
|
| 23514 |
"stateful_callbacks": {
|
|
|
|
| 23523 |
"attributes": {}
|
| 23524 |
}
|
| 23525 |
},
|
| 23526 |
+
"total_flos": 1.224622467372331e+20,
|
| 23527 |
"train_batch_size": 32,
|
| 23528 |
"trial_name": null,
|
| 23529 |
"trial_params": null
|