Training in progress, step 115000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4052b3f6dee6acc6e8461ad996dfa79e27245712edf2d1f3321a44a85660ffc
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0922c14e94c809f8792d25d931657f0739836f8872958cc36e36c78337b7886b
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db41ee9f728a0f615e34c377aa1f203a61ceeaf873404658f962d92e3c5c6285
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e98ce821f7f40a728bc6b049ace38a924402d0d066809b7215e9faa83ce3c45c
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -20300,11 +20300,189 @@
|
|
| 20300 |
"eval_steps_per_second": 15.134,
|
| 20301 |
"num_input_tokens_seen": 59759249088,
|
| 20302 |
"step": 114000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20303 |
}
|
| 20304 |
],
|
| 20305 |
"logging_steps": 50,
|
| 20306 |
"max_steps": 140000,
|
| 20307 |
-
"num_input_tokens_seen":
|
| 20308 |
"num_train_epochs": 2,
|
| 20309 |
"save_steps": 1000,
|
| 20310 |
"stateful_callbacks": {
|
|
@@ -20319,7 +20497,7 @@
|
|
| 20319 |
"attributes": {}
|
| 20320 |
}
|
| 20321 |
},
|
| 20322 |
-
"total_flos": 1.
|
| 20323 |
"train_batch_size": 32,
|
| 20324 |
"trial_name": null,
|
| 20325 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.0971081722456086,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 115000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 20300 |
"eval_steps_per_second": 15.134,
|
| 20301 |
"num_input_tokens_seen": 59759249088,
|
| 20302 |
"step": 114000
|
| 20303 |
+
},
|
| 20304 |
+
{
|
| 20305 |
+
"epoch": 1.0880451244380314,
|
| 20306 |
+
"grad_norm": 0.1358513981103897,
|
| 20307 |
+
"learning_rate": 0.0009868321189035196,
|
| 20308 |
+
"loss": 2.1057,
|
| 20309 |
+
"num_input_tokens_seen": 59785457920,
|
| 20310 |
+
"step": 114050
|
| 20311 |
+
},
|
| 20312 |
+
{
|
| 20313 |
+
"epoch": 1.0885221269542198,
|
| 20314 |
+
"grad_norm": 0.14738275110721588,
|
| 20315 |
+
"learning_rate": 0.0009861849601988384,
|
| 20316 |
+
"loss": 2.099,
|
| 20317 |
+
"num_input_tokens_seen": 59811672288,
|
| 20318 |
+
"step": 114100
|
| 20319 |
+
},
|
| 20320 |
+
{
|
| 20321 |
+
"epoch": 1.088999129470408,
|
| 20322 |
+
"grad_norm": 0.16324234008789062,
|
| 20323 |
+
"learning_rate": 0.0009855225003441628,
|
| 20324 |
+
"loss": 2.0952,
|
| 20325 |
+
"num_input_tokens_seen": 59837885600,
|
| 20326 |
+
"step": 114150
|
| 20327 |
+
},
|
| 20328 |
+
{
|
| 20329 |
+
"epoch": 1.0894761319865962,
|
| 20330 |
+
"grad_norm": 0.15156808495521545,
|
| 20331 |
+
"learning_rate": 0.0009848447601883434,
|
| 20332 |
+
"loss": 2.1014,
|
| 20333 |
+
"num_input_tokens_seen": 59864099392,
|
| 20334 |
+
"step": 114200
|
| 20335 |
+
},
|
| 20336 |
+
{
|
| 20337 |
+
"epoch": 1.0899531345027844,
|
| 20338 |
+
"grad_norm": 0.14273667335510254,
|
| 20339 |
+
"learning_rate": 0.0009841517610611307,
|
| 20340 |
+
"loss": 2.0898,
|
| 20341 |
+
"num_input_tokens_seen": 59890311072,
|
| 20342 |
+
"step": 114250
|
| 20343 |
+
},
|
| 20344 |
+
{
|
| 20345 |
+
"epoch": 1.0904301370189728,
|
| 20346 |
+
"grad_norm": 0.1409289538860321,
|
| 20347 |
+
"learning_rate": 0.0009834435247725033,
|
| 20348 |
+
"loss": 2.0798,
|
| 20349 |
+
"num_input_tokens_seen": 59916523776,
|
| 20350 |
+
"step": 114300
|
| 20351 |
+
},
|
| 20352 |
+
{
|
| 20353 |
+
"epoch": 1.090907139535161,
|
| 20354 |
+
"grad_norm": 0.13659177720546722,
|
| 20355 |
+
"learning_rate": 0.0009827200736119814,
|
| 20356 |
+
"loss": 2.084,
|
| 20357 |
+
"num_input_tokens_seen": 59942727744,
|
| 20358 |
+
"step": 114350
|
| 20359 |
+
},
|
| 20360 |
+
{
|
| 20361 |
+
"epoch": 1.0913841420513493,
|
| 20362 |
+
"grad_norm": 0.14861910045146942,
|
| 20363 |
+
"learning_rate": 0.0009819814303479266,
|
| 20364 |
+
"loss": 2.1021,
|
| 20365 |
+
"num_input_tokens_seen": 59968942144,
|
| 20366 |
+
"step": 114400
|
| 20367 |
+
},
|
| 20368 |
+
{
|
| 20369 |
+
"epoch": 1.0918611445675377,
|
| 20370 |
+
"grad_norm": 0.13872170448303223,
|
| 20371 |
+
"learning_rate": 0.0009812276182268236,
|
| 20372 |
+
"loss": 2.1001,
|
| 20373 |
+
"num_input_tokens_seen": 59995154848,
|
| 20374 |
+
"step": 114450
|
| 20375 |
+
},
|
| 20376 |
+
{
|
| 20377 |
+
"epoch": 1.092338147083726,
|
| 20378 |
+
"grad_norm": 0.14306657016277313,
|
| 20379 |
+
"learning_rate": 0.00098045866097255,
|
| 20380 |
+
"loss": 2.0837,
|
| 20381 |
+
"num_input_tokens_seen": 60021363392,
|
| 20382 |
+
"step": 114500
|
| 20383 |
+
},
|
| 20384 |
+
{
|
| 20385 |
+
"epoch": 1.092338147083726,
|
| 20386 |
+
"eval_loss": 2.0082569122314453,
|
| 20387 |
+
"eval_runtime": 82.8417,
|
| 20388 |
+
"eval_samples_per_second": 60.356,
|
| 20389 |
+
"eval_steps_per_second": 15.089,
|
| 20390 |
+
"num_input_tokens_seen": 60021363392,
|
| 20391 |
+
"step": 114500
|
| 20392 |
+
},
|
| 20393 |
+
{
|
| 20394 |
+
"epoch": 1.092815149599914,
|
| 20395 |
+
"grad_norm": 0.1300678551197052,
|
| 20396 |
+
"learning_rate": 0.000979674582785628,
|
| 20397 |
+
"loss": 2.0904,
|
| 20398 |
+
"num_input_tokens_seen": 60047570880,
|
| 20399 |
+
"step": 114550
|
| 20400 |
+
},
|
| 20401 |
+
{
|
| 20402 |
+
"epoch": 1.0932921521161023,
|
| 20403 |
+
"grad_norm": 0.1488349586725235,
|
| 20404 |
+
"learning_rate": 0.0009788754083424652,
|
| 20405 |
+
"loss": 2.0969,
|
| 20406 |
+
"num_input_tokens_seen": 60073778944,
|
| 20407 |
+
"step": 114600
|
| 20408 |
+
},
|
| 20409 |
+
{
|
| 20410 |
+
"epoch": 1.0937691546322907,
|
| 20411 |
+
"grad_norm": 0.14389395713806152,
|
| 20412 |
+
"learning_rate": 0.000978061162794576,
|
| 20413 |
+
"loss": 2.0956,
|
| 20414 |
+
"num_input_tokens_seen": 60099993344,
|
| 20415 |
+
"step": 114650
|
| 20416 |
+
},
|
| 20417 |
+
{
|
| 20418 |
+
"epoch": 1.094246157148479,
|
| 20419 |
+
"grad_norm": 0.13556672632694244,
|
| 20420 |
+
"learning_rate": 0.0009772318717677904,
|
| 20421 |
+
"loss": 2.0856,
|
| 20422 |
+
"num_input_tokens_seen": 60126204832,
|
| 20423 |
+
"step": 114700
|
| 20424 |
+
},
|
| 20425 |
+
{
|
| 20426 |
+
"epoch": 1.0947231596646672,
|
| 20427 |
+
"grad_norm": 0.14573290944099426,
|
| 20428 |
+
"learning_rate": 0.0009763875613614481,
|
| 20429 |
+
"loss": 2.083,
|
| 20430 |
+
"num_input_tokens_seen": 60152411456,
|
| 20431 |
+
"step": 114750
|
| 20432 |
+
},
|
| 20433 |
+
{
|
| 20434 |
+
"epoch": 1.0952001621808556,
|
| 20435 |
+
"grad_norm": 0.14349648356437683,
|
| 20436 |
+
"learning_rate": 0.0009755282581475768,
|
| 20437 |
+
"loss": 2.099,
|
| 20438 |
+
"num_input_tokens_seen": 60178616832,
|
| 20439 |
+
"step": 114800
|
| 20440 |
+
},
|
| 20441 |
+
{
|
| 20442 |
+
"epoch": 1.0956771646970438,
|
| 20443 |
+
"grad_norm": 0.1363336592912674,
|
| 20444 |
+
"learning_rate": 0.0009746539891700557,
|
| 20445 |
+
"loss": 2.0941,
|
| 20446 |
+
"num_input_tokens_seen": 60204821568,
|
| 20447 |
+
"step": 114850
|
| 20448 |
+
},
|
| 20449 |
+
{
|
| 20450 |
+
"epoch": 1.096154167213232,
|
| 20451 |
+
"grad_norm": 0.14463187754154205,
|
| 20452 |
+
"learning_rate": 0.0009737647819437645,
|
| 20453 |
+
"loss": 2.0987,
|
| 20454 |
+
"num_input_tokens_seen": 60231035968,
|
| 20455 |
+
"step": 114900
|
| 20456 |
+
},
|
| 20457 |
+
{
|
| 20458 |
+
"epoch": 1.0966311697294202,
|
| 20459 |
+
"grad_norm": 0.14132525026798248,
|
| 20460 |
+
"learning_rate": 0.0009728606644537177,
|
| 20461 |
+
"loss": 2.0954,
|
| 20462 |
+
"num_input_tokens_seen": 60257250368,
|
| 20463 |
+
"step": 114950
|
| 20464 |
+
},
|
| 20465 |
+
{
|
| 20466 |
+
"epoch": 1.0971081722456086,
|
| 20467 |
+
"grad_norm": 0.14640025794506073,
|
| 20468 |
+
"learning_rate": 0.0009719416651541838,
|
| 20469 |
+
"loss": 2.0992,
|
| 20470 |
+
"num_input_tokens_seen": 60283464768,
|
| 20471 |
+
"step": 115000
|
| 20472 |
+
},
|
| 20473 |
+
{
|
| 20474 |
+
"epoch": 1.0971081722456086,
|
| 20475 |
+
"eval_loss": 2.007655620574951,
|
| 20476 |
+
"eval_runtime": 82.4937,
|
| 20477 |
+
"eval_samples_per_second": 60.611,
|
| 20478 |
+
"eval_steps_per_second": 15.153,
|
| 20479 |
+
"num_input_tokens_seen": 60283464768,
|
| 20480 |
+
"step": 115000
|
| 20481 |
}
|
| 20482 |
],
|
| 20483 |
"logging_steps": 50,
|
| 20484 |
"max_steps": 140000,
|
| 20485 |
+
"num_input_tokens_seen": 60283464768,
|
| 20486 |
"num_train_epochs": 2,
|
| 20487 |
"save_steps": 1000,
|
| 20488 |
"stateful_callbacks": {
|
|
|
|
| 20497 |
"attributes": {}
|
| 20498 |
}
|
| 20499 |
},
|
| 20500 |
+
"total_flos": 1.0669073693538632e+20,
|
| 20501 |
"train_batch_size": 32,
|
| 20502 |
"trial_name": null,
|
| 20503 |
"trial_params": null
|