Training in progress, step 7500, checkpoint
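The commit message has the form that the transformers Trainer writes when it pushes an intermediate checkpoint to the Hub during training. As a hedged sketch only (the training script and repository are not part of this commit), a configuration along these lines would produce a "last-checkpoint" folder and commits like this one; the argument values are placeholders inferred from the diff below, not read from the actual run:

```python
# Hedged sketch: TrainingArguments that would push a "last-checkpoint" folder
# to the Hub mid-training. Values are illustrative placeholders; only the
# commented correspondences come from the trainer_state.json diff below.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    push_to_hub=True,                 # upload progress to the Hub while training
    hub_strategy="checkpoint",        # keep the latest checkpoint under "last-checkpoint/"
    save_steps=500,                   # a 500-step interval is consistent with checkpoints at 7000 and 7500
    logging_steps=10,                 # matches "logging_steps": 10 in trainer_state.json
    eval_steps=100,                   # matches "eval_steps": 100 in trainer_state.json
    per_device_train_batch_size=2,    # matches "train_batch_size": 2 in trainer_state.json
)
```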
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1fc17cf4778de00f56eb3118cc881d33a3737ff10e3eb03e214d86942e058e70
 size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3aa3a4015c1121204d1e50a3f1402c43eca6ceea15098443b41437862247a64e
 size 4768663315
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
 size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a6db8d4f24a9b059deca696b72055b5814e66617e31dc4227844e631fef5e5cd
 size 1465
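Each of the four entries above is a Git LFS pointer file rather than the binary itself: the tracked text holds only the spec version, the `oid sha256:` of the blob, and its size in bytes, so this commit records that the roughly 2.4 GB model weights plus the optimizer, RNG, and scheduler states were replaced with new blobs. A hedged sketch for fetching the actual weights instead of the three-line pointer follows; the repo id is a placeholder, since the repository name is not shown in this diff:

```python
# Hedged sketch: resolve the LFS pointer to the real checkpoint file.
# "user/repo" is a placeholder; this commit does not name the repository.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="user/repo",                           # placeholder repo id
    filename="last-checkpoint/model.safetensors",  # downloads the 2,384,234,968-byte blob behind the pointer
)
print(path)
```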
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch":
+  "epoch": 2.12404857152972,
   "eval_steps": 100,
-  "global_step":
+  "global_step": 7500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -6308,6 +6308,456 @@
       "mean_token_accuracy": 0.7816780813038349,
       "num_tokens": 57342976.0,
       "step": 7000
+    },
+    {
+      "epoch": 1.9853081743193968,
+      "grad_norm": 1.2141501903533936,
+      "learning_rate": 3.759572013007448e-06,
+      "loss": 0.1356,
+      "mean_token_accuracy": 0.7747798431664705,
+      "num_tokens": 57424896.0,
+      "step": 7010
+    },
+    {
+      "epoch": 1.9881403334867418,
+      "grad_norm": 1.7239429950714111,
+      "learning_rate": 3.7490821357390124e-06,
+      "loss": 0.1708,
+      "mean_token_accuracy": 0.7312010746449232,
+      "num_tokens": 57506816.0,
+      "step": 7020
+    },
+    {
+      "epoch": 1.990972492654087,
+      "grad_norm": 1.4694973230361938,
+      "learning_rate": 3.7385922584705765e-06,
+      "loss": 0.1418,
+      "mean_token_accuracy": 0.7763209376484156,
+      "num_tokens": 57588736.0,
+      "step": 7030
+    },
+    {
+      "epoch": 1.9938046518214323,
+      "grad_norm": 1.2129154205322266,
+      "learning_rate": 3.72810238120214e-06,
+      "loss": 0.1644,
+      "mean_token_accuracy": 0.7523361068218947,
+      "num_tokens": 57670656.0,
+      "step": 7040
+    },
+    {
+      "epoch": 1.9966368109887775,
+      "grad_norm": 1.0555603504180908,
+      "learning_rate": 3.7176125039337042e-06,
+      "loss": 0.1415,
+      "mean_token_accuracy": 0.7746942289173603,
+      "num_tokens": 57752576.0,
+      "step": 7050
+    },
+    {
+      "epoch": 1.9994689701561228,
+      "grad_norm": 1.5597031116485596,
+      "learning_rate": 3.7071226266652683e-06,
+      "loss": 0.1409,
+      "mean_token_accuracy": 0.7636497039347887,
+      "num_tokens": 57834496.0,
+      "step": 7060
+    },
+    {
+      "epoch": 2.002265727333876,
+      "grad_norm": 1.1237660646438599,
+      "learning_rate": 3.6966327493968324e-06,
+      "loss": 0.1065,
+      "mean_token_accuracy": 0.7782952212080171,
+      "num_tokens": 57915392.0,
+      "step": 7070
+    },
+    {
+      "epoch": 2.0050978865012214,
+      "grad_norm": 1.2385786771774292,
+      "learning_rate": 3.6861428721283965e-06,
+      "loss": 0.11,
+      "mean_token_accuracy": 0.7639187891036272,
+      "num_tokens": 57997312.0,
+      "step": 7080
+    },
+    {
+      "epoch": 2.0079300456685667,
+      "grad_norm": 1.055507779121399,
+      "learning_rate": 3.6756529948599605e-06,
+      "loss": 0.113,
+      "mean_token_accuracy": 0.7746575351804494,
+      "num_tokens": 58079232.0,
+      "step": 7090
+    },
+    {
+      "epoch": 2.010762204835912,
+      "grad_norm": 1.1237515211105347,
+      "learning_rate": 3.665163117591524e-06,
+      "loss": 0.0966,
+      "mean_token_accuracy": 0.781531311571598,
+      "num_tokens": 58161152.0,
+      "step": 7100
+    },
+    {
+      "epoch": 2.013594364003257,
+      "grad_norm": 1.7721480131149292,
+      "learning_rate": 3.6546732403230883e-06,
+      "loss": 0.1252,
+      "mean_token_accuracy": 0.761117908358574,
+      "num_tokens": 58243072.0,
+      "step": 7110
+    },
+    {
+      "epoch": 2.016426523170602,
+      "grad_norm": 1.310492992401123,
+      "learning_rate": 3.6441833630546523e-06,
+      "loss": 0.1261,
+      "mean_token_accuracy": 0.7766267094761133,
+      "num_tokens": 58324992.0,
+      "step": 7120
+    },
+    {
+      "epoch": 2.0192586823379473,
+      "grad_norm": 1.3270450830459595,
+      "learning_rate": 3.6336934857862164e-06,
+      "loss": 0.1702,
+      "mean_token_accuracy": 0.729219663143158,
+      "num_tokens": 58406912.0,
+      "step": 7130
+    },
+    {
+      "epoch": 2.0220908415052925,
+      "grad_norm": 1.4443167448043823,
+      "learning_rate": 3.623203608517781e-06,
+      "loss": 0.1288,
+      "mean_token_accuracy": 0.7712328769266605,
+      "num_tokens": 58488832.0,
+      "step": 7140
+    },
+    {
+      "epoch": 2.0249230006726378,
+      "grad_norm": 1.5470432043075562,
+      "learning_rate": 3.612713731249345e-06,
+      "loss": 0.1346,
+      "mean_token_accuracy": 0.7625000022351742,
+      "num_tokens": 58570752.0,
+      "step": 7150
+    },
+    {
+      "epoch": 2.027755159839983,
+      "grad_norm": 1.260907530784607,
+      "learning_rate": 3.6022238539809086e-06,
+      "loss": 0.104,
+      "mean_token_accuracy": 0.7806873768568039,
+      "num_tokens": 58652672.0,
+      "step": 7160
+    },
+    {
+      "epoch": 2.0305873190073283,
+      "grad_norm": 0.9440592527389526,
+      "learning_rate": 3.5917339767124727e-06,
+      "loss": 0.1294,
+      "mean_token_accuracy": 0.75951565541327,
+      "num_tokens": 58734592.0,
+      "step": 7170
+    },
+    {
+      "epoch": 2.0334194781746735,
+      "grad_norm": 1.4341137409210205,
+      "learning_rate": 3.581244099444037e-06,
+      "loss": 0.1188,
+      "mean_token_accuracy": 0.7584393329918384,
+      "num_tokens": 58816512.0,
+      "step": 7180
+    },
+    {
+      "epoch": 2.0362516373420187,
+      "grad_norm": 1.1970900297164917,
+      "learning_rate": 3.570754222175601e-06,
+      "loss": 0.0963,
+      "mean_token_accuracy": 0.7981164366006851,
+      "num_tokens": 58898432.0,
+      "step": 7190
+    },
+    {
+      "epoch": 2.039083796509364,
+      "grad_norm": 1.7112830877304077,
+      "learning_rate": 3.560264344907165e-06,
+      "loss": 0.1528,
+      "mean_token_accuracy": 0.7553693726658821,
+      "num_tokens": 58980352.0,
+      "step": 7200
+    },
+    {
+      "epoch": 2.0419159556767092,
+      "grad_norm": 1.4228155612945557,
+      "learning_rate": 3.549774467638729e-06,
+      "loss": 0.1225,
+      "mean_token_accuracy": 0.7504280801862478,
+      "num_tokens": 59062272.0,
+      "step": 7210
+    },
+    {
+      "epoch": 2.044748114844054,
+      "grad_norm": 1.5878945589065552,
+      "learning_rate": 3.5392845903702927e-06,
+      "loss": 0.1153,
+      "mean_token_accuracy": 0.7910225056111813,
+      "num_tokens": 59144192.0,
+      "step": 7220
+    },
+    {
+      "epoch": 2.0475802740113993,
+      "grad_norm": 1.1206371784210205,
+      "learning_rate": 3.5287947131018567e-06,
+      "loss": 0.1279,
+      "mean_token_accuracy": 0.7498165342956782,
+      "num_tokens": 59226112.0,
+      "step": 7230
+    },
+    {
+      "epoch": 2.0504124331787446,
+      "grad_norm": 1.6621618270874023,
+      "learning_rate": 3.518304835833421e-06,
+      "loss": 0.1217,
+      "mean_token_accuracy": 0.7606286689639091,
+      "num_tokens": 59308032.0,
+      "step": 7240
+    },
+    {
+      "epoch": 2.05324459234609,
+      "grad_norm": 1.387544870376587,
+      "learning_rate": 3.507814958564985e-06,
+      "loss": 0.1446,
+      "mean_token_accuracy": 0.7522015646100044,
+      "num_tokens": 59389952.0,
+      "step": 7250
+    },
+    {
+      "epoch": 2.056076751513435,
+      "grad_norm": 1.303863525390625,
+      "learning_rate": 3.4973250812965494e-06,
+      "loss": 0.1362,
+      "mean_token_accuracy": 0.7659858129918575,
+      "num_tokens": 59471872.0,
+      "step": 7260
+    },
+    {
+      "epoch": 2.0589089106807803,
+      "grad_norm": 1.3478611707687378,
+      "learning_rate": 3.4868352040281135e-06,
+      "loss": 0.1246,
+      "mean_token_accuracy": 0.7809809193015098,
+      "num_tokens": 59553792.0,
+      "step": 7270
+    },
+    {
+      "epoch": 2.0617410698481256,
+      "grad_norm": 1.1989835500717163,
+      "learning_rate": 3.476345326759677e-06,
+      "loss": 0.1085,
+      "mean_token_accuracy": 0.7756604697555304,
+      "num_tokens": 59635712.0,
+      "step": 7280
+    },
+    {
+      "epoch": 2.064573229015471,
+      "grad_norm": 1.1155551671981812,
+      "learning_rate": 3.465855449491241e-06,
+      "loss": 0.1089,
+      "mean_token_accuracy": 0.7770547963678837,
+      "num_tokens": 59717632.0,
+      "step": 7290
+    },
+    {
+      "epoch": 2.067405388182816,
+      "grad_norm": 1.2903120517730713,
+      "learning_rate": 3.4553655722228053e-06,
+      "loss": 0.1234,
+      "mean_token_accuracy": 0.7721501953899861,
+      "num_tokens": 59799552.0,
+      "step": 7300
+    },
+    {
+      "epoch": 2.0702375473501613,
+      "grad_norm": 1.6666812896728516,
+      "learning_rate": 3.4448756949543694e-06,
+      "loss": 0.1253,
+      "mean_token_accuracy": 0.7631360098719597,
+      "num_tokens": 59881472.0,
+      "step": 7310
+    },
+    {
+      "epoch": 2.073069706517506,
+      "grad_norm": 1.2040691375732422,
+      "learning_rate": 3.4343858176859334e-06,
+      "loss": 0.1359,
+      "mean_token_accuracy": 0.7778008833527565,
+      "num_tokens": 59963392.0,
+      "step": 7320
+    },
+    {
+      "epoch": 2.0759018656848514,
+      "grad_norm": 1.2499768733978271,
+      "learning_rate": 3.4238959404174975e-06,
+      "loss": 0.1113,
+      "mean_token_accuracy": 0.7751345403492451,
+      "num_tokens": 60045312.0,
+      "step": 7330
+    },
+    {
+      "epoch": 2.0787340248521966,
+      "grad_norm": 0.9466302990913391,
+      "learning_rate": 3.413406063149061e-06,
+      "loss": 0.1402,
+      "mean_token_accuracy": 0.7586472611874342,
+      "num_tokens": 60127232.0,
+      "step": 7340
+    },
+    {
+      "epoch": 2.081566184019542,
+      "grad_norm": 1.0079811811447144,
+      "learning_rate": 3.4029161858806252e-06,
+      "loss": 0.1033,
+      "mean_token_accuracy": 0.7875611506402492,
+      "num_tokens": 60209152.0,
+      "step": 7350
+    },
+    {
+      "epoch": 2.084398343186887,
+      "grad_norm": 1.493399977684021,
+      "learning_rate": 3.3924263086121893e-06,
+      "loss": 0.1128,
+      "mean_token_accuracy": 0.7791707415133715,
+      "num_tokens": 60291072.0,
+      "step": 7360
+    },
+    {
+      "epoch": 2.0872305023542324,
+      "grad_norm": 1.5899913311004639,
+      "learning_rate": 3.381936431343754e-06,
+      "loss": 0.154,
+      "mean_token_accuracy": 0.7400317970663309,
+      "num_tokens": 60372992.0,
+      "step": 7370
+    },
+    {
+      "epoch": 2.0900626615215776,
+      "grad_norm": 1.5314220190048218,
+      "learning_rate": 3.371446554075318e-06,
+      "loss": 0.1232,
+      "mean_token_accuracy": 0.7905455000698567,
+      "num_tokens": 60454912.0,
+      "step": 7380
+    },
+    {
+      "epoch": 2.092894820688923,
+      "grad_norm": 1.2721341848373413,
+      "learning_rate": 3.360956676806882e-06,
+      "loss": 0.1085,
+      "mean_token_accuracy": 0.7729696653783321,
+      "num_tokens": 60536832.0,
+      "step": 7390
+    },
+    {
+      "epoch": 2.095726979856268,
+      "grad_norm": 1.1642765998840332,
+      "learning_rate": 3.3504667995384456e-06,
+      "loss": 0.1081,
+      "mean_token_accuracy": 0.7889799427241087,
+      "num_tokens": 60618752.0,
+      "step": 7400
+    },
+    {
+      "epoch": 2.098559139023613,
+      "grad_norm": 1.3702284097671509,
+      "learning_rate": 3.3399769222700097e-06,
+      "loss": 0.1094,
+      "mean_token_accuracy": 0.7912671204656363,
+      "num_tokens": 60700672.0,
+      "step": 7410
+    },
+    {
+      "epoch": 2.101391298190958,
+      "grad_norm": 1.2944626808166504,
+      "learning_rate": 3.3294870450015738e-06,
+      "loss": 0.1313,
+      "mean_token_accuracy": 0.7547211341559887,
+      "num_tokens": 60782592.0,
+      "step": 7420
+    },
+    {
+      "epoch": 2.1042234573583034,
+      "grad_norm": 1.1357483863830566,
+      "learning_rate": 3.318997167733138e-06,
+      "loss": 0.1069,
+      "mean_token_accuracy": 0.7714530322700739,
+      "num_tokens": 60864512.0,
+      "step": 7430
+    },
+    {
+      "epoch": 2.1070556165256487,
+      "grad_norm": 1.0823742151260376,
+      "learning_rate": 3.308507290464702e-06,
+      "loss": 0.1106,
+      "mean_token_accuracy": 0.7675146773457527,
+      "num_tokens": 60946432.0,
+      "step": 7440
+    },
+    {
+      "epoch": 2.109887775692994,
+      "grad_norm": 1.2482222318649292,
+      "learning_rate": 3.298017413196266e-06,
+      "loss": 0.1122,
+      "mean_token_accuracy": 0.7684442289173603,
+      "num_tokens": 61028352.0,
+      "step": 7450
+    },
+    {
+      "epoch": 2.112719934860339,
+      "grad_norm": 1.9791706800460815,
+      "learning_rate": 3.2875275359278296e-06,
+      "loss": 0.1222,
+      "mean_token_accuracy": 0.7773728009313345,
+      "num_tokens": 61110272.0,
+      "step": 7460
+    },
+    {
+      "epoch": 2.1155520940276844,
+      "grad_norm": 1.8661694526672363,
+      "learning_rate": 3.2770376586593937e-06,
+      "loss": 0.1091,
+      "mean_token_accuracy": 0.7745596896857023,
+      "num_tokens": 61192192.0,
+      "step": 7470
+    },
+    {
+      "epoch": 2.1183842531950297,
+      "grad_norm": 0.9961443543434143,
+      "learning_rate": 3.2665477813909578e-06,
+      "loss": 0.1212,
+      "mean_token_accuracy": 0.7665973592549562,
+      "num_tokens": 61274112.0,
+      "step": 7480
+    },
+    {
+      "epoch": 2.121216412362375,
+      "grad_norm": 1.1755738258361816,
+      "learning_rate": 3.2560579041225223e-06,
+      "loss": 0.1216,
+      "mean_token_accuracy": 0.7842465721070766,
+      "num_tokens": 61356032.0,
+      "step": 7490
+    },
+    {
+      "epoch": 2.12404857152972,
+      "grad_norm": 1.227830171585083,
+      "learning_rate": 3.2455680268540864e-06,
+      "loss": 0.1372,
+      "mean_token_accuracy": 0.7647749528288841,
+      "num_tokens": 61437952.0,
+      "step": 7500
     }
   ],
   "logging_steps": 10,
@@ -6327,7 +6777,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.
+  "total_flos": 1.6236851051574067e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
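The bulk of the trainer_state.json change is the 450 new lines of log_history: one entry every 10 steps from 7010 through 7500, each recording epoch, grad_norm, learning_rate, loss, mean_token_accuracy, and num_tokens. A small, hedged sketch for inspecting that history from a local copy of the checkpoint; it assumes last-checkpoint/trainer_state.json has already been cloned or downloaded:

```python
# Hedged sketch: summarize the log_history entries added in this commit.
# Assumes "last-checkpoint/trainer_state.json" is available locally.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(state["global_step"], state["epoch"])  # 7500, ~2.124

# Entries logged after step 7000 are the ones appended by this checkpoint.
new_entries = [e for e in state["log_history"] if e.get("step", 0) > 7000]
losses = [e["loss"] for e in new_entries if "loss" in e]
print(f"{len(new_entries)} new entries, mean loss {sum(losses) / len(losses):.4f}")
```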