{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9960988296488944,
  "eval_steps": 500,
  "global_step": 864,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.034677069787602946,
      "grad_norm": 4.0249637874471995,
      "learning_rate": 5e-06,
      "loss": 1.0488,
      "step": 10
    },
    {
      "epoch": 0.06935413957520589,
      "grad_norm": 1.7273291496388328,
      "learning_rate": 5e-06,
      "loss": 0.9328,
      "step": 20
    },
    {
      "epoch": 0.10403120936280884,
      "grad_norm": 2.588347397545261,
      "learning_rate": 5e-06,
      "loss": 0.8892,
      "step": 30
    },
    {
      "epoch": 0.13870827915041178,
      "grad_norm": 1.5360583750484056,
      "learning_rate": 5e-06,
      "loss": 0.8717,
      "step": 40
    },
    {
      "epoch": 0.17338534893801474,
      "grad_norm": 1.1396695772844396,
      "learning_rate": 5e-06,
      "loss": 0.8475,
      "step": 50
    },
    {
      "epoch": 0.20806241872561768,
      "grad_norm": 1.0308386132806844,
      "learning_rate": 5e-06,
      "loss": 0.8293,
      "step": 60
    },
    {
      "epoch": 0.24273948851322064,
      "grad_norm": 1.0528463497860754,
      "learning_rate": 5e-06,
      "loss": 0.8176,
      "step": 70
    },
    {
      "epoch": 0.27741655830082357,
      "grad_norm": 1.0138322547682082,
      "learning_rate": 5e-06,
      "loss": 0.8068,
      "step": 80
    },
    {
      "epoch": 0.31209362808842656,
      "grad_norm": 1.0784904725043016,
      "learning_rate": 5e-06,
      "loss": 0.7992,
      "step": 90
    },
    {
      "epoch": 0.3467706978760295,
      "grad_norm": 0.9906052569468718,
      "learning_rate": 5e-06,
      "loss": 0.7991,
      "step": 100
    },
    {
      "epoch": 0.3814477676636324,
      "grad_norm": 1.1832971820048892,
      "learning_rate": 5e-06,
      "loss": 0.7894,
      "step": 110
    },
    {
      "epoch": 0.41612483745123535,
      "grad_norm": 0.773999499704406,
      "learning_rate": 5e-06,
      "loss": 0.7832,
      "step": 120
    },
    {
      "epoch": 0.45080190723883834,
      "grad_norm": 1.1303467514316676,
      "learning_rate": 5e-06,
      "loss": 0.7807,
      "step": 130
    },
    {
      "epoch": 0.48547897702644127,
      "grad_norm": 0.6069961967445706,
      "learning_rate": 5e-06,
      "loss": 0.7857,
      "step": 140
    },
    {
      "epoch": 0.5201560468140443,
      "grad_norm": 0.6816537291509634,
      "learning_rate": 5e-06,
      "loss": 0.7779,
      "step": 150
    },
    {
      "epoch": 0.5548331166016471,
      "grad_norm": 0.7684116239237637,
      "learning_rate": 5e-06,
      "loss": 0.7739,
      "step": 160
    },
    {
      "epoch": 0.5895101863892501,
      "grad_norm": 0.8491777059398649,
      "learning_rate": 5e-06,
      "loss": 0.7757,
      "step": 170
    },
    {
      "epoch": 0.6241872561768531,
      "grad_norm": 1.0896111775038086,
      "learning_rate": 5e-06,
      "loss": 0.7675,
      "step": 180
    },
    {
      "epoch": 0.658864325964456,
      "grad_norm": 0.8510614586902955,
      "learning_rate": 5e-06,
      "loss": 0.7732,
      "step": 190
    },
    {
      "epoch": 0.693541395752059,
      "grad_norm": 0.6064417861325208,
      "learning_rate": 5e-06,
      "loss": 0.7677,
      "step": 200
    },
    {
      "epoch": 0.7282184655396619,
      "grad_norm": 0.5980019145794307,
      "learning_rate": 5e-06,
      "loss": 0.7667,
      "step": 210
    },
    {
      "epoch": 0.7628955353272648,
      "grad_norm": 0.7017739167578199,
      "learning_rate": 5e-06,
      "loss": 0.7644,
      "step": 220
    },
    {
      "epoch": 0.7975726051148678,
      "grad_norm": 0.6027062835922619,
      "learning_rate": 5e-06,
      "loss": 0.7664,
      "step": 230
    },
    {
      "epoch": 0.8322496749024707,
      "grad_norm": 0.6596293739585312,
      "learning_rate": 5e-06,
      "loss": 0.7559,
      "step": 240
    },
    {
      "epoch": 0.8669267446900737,
      "grad_norm": 0.7441210106410687,
      "learning_rate": 5e-06,
      "loss": 0.7612,
      "step": 250
    },
    {
      "epoch": 0.9016038144776767,
      "grad_norm": 0.7762267182363182,
      "learning_rate": 5e-06,
      "loss": 0.758,
      "step": 260
    },
    {
      "epoch": 0.9362808842652796,
      "grad_norm": 0.6385587861511055,
      "learning_rate": 5e-06,
      "loss": 0.759,
      "step": 270
    },
    {
      "epoch": 0.9709579540528825,
      "grad_norm": 0.6361978634019184,
      "learning_rate": 5e-06,
      "loss": 0.7583,
      "step": 280
    },
    {
      "epoch": 0.9986996098829649,
      "eval_loss": 0.753886342048645,
      "eval_runtime": 197.8618,
      "eval_samples_per_second": 39.265,
      "eval_steps_per_second": 0.617,
      "step": 288
    },
    {
      "epoch": 1.0056350238404854,
      "grad_norm": 0.9912660198339807,
      "learning_rate": 5e-06,
      "loss": 0.7976,
      "step": 290
    },
    {
      "epoch": 1.0403120936280885,
      "grad_norm": 1.535110610753484,
      "learning_rate": 5e-06,
      "loss": 0.7106,
      "step": 300
    },
    {
      "epoch": 1.0749891634156914,
      "grad_norm": 0.7729921866431072,
      "learning_rate": 5e-06,
      "loss": 0.7066,
      "step": 310
    },
    {
      "epoch": 1.1096662332032943,
      "grad_norm": 0.6463456897522241,
      "learning_rate": 5e-06,
      "loss": 0.7058,
      "step": 320
    },
    {
      "epoch": 1.1443433029908974,
      "grad_norm": 0.6035036766400345,
      "learning_rate": 5e-06,
      "loss": 0.7031,
      "step": 330
    },
    {
      "epoch": 1.1790203727785002,
      "grad_norm": 0.7290582812008339,
      "learning_rate": 5e-06,
      "loss": 0.6997,
      "step": 340
    },
    {
      "epoch": 1.2136974425661031,
      "grad_norm": 0.6606339548104747,
      "learning_rate": 5e-06,
      "loss": 0.7048,
      "step": 350
    },
    {
      "epoch": 1.2483745123537062,
      "grad_norm": 0.7747101369350952,
      "learning_rate": 5e-06,
      "loss": 0.7049,
      "step": 360
    },
    {
      "epoch": 1.283051582141309,
      "grad_norm": 0.6447861318282291,
      "learning_rate": 5e-06,
      "loss": 0.7025,
      "step": 370
    },
    {
      "epoch": 1.317728651928912,
      "grad_norm": 0.5446961444193648,
      "learning_rate": 5e-06,
      "loss": 0.7071,
      "step": 380
    },
    {
      "epoch": 1.352405721716515,
      "grad_norm": 0.6316405546311903,
      "learning_rate": 5e-06,
      "loss": 0.7062,
      "step": 390
    },
    {
      "epoch": 1.387082791504118,
      "grad_norm": 0.7714724505395334,
      "learning_rate": 5e-06,
      "loss": 0.7013,
      "step": 400
    },
    {
      "epoch": 1.4217598612917208,
      "grad_norm": 0.6435708845447846,
      "learning_rate": 5e-06,
      "loss": 0.7043,
      "step": 410
    },
    {
      "epoch": 1.456436931079324,
      "grad_norm": 0.6549313245316034,
      "learning_rate": 5e-06,
      "loss": 0.705,
      "step": 420
    },
    {
      "epoch": 1.4911140008669268,
      "grad_norm": 0.6778607750028962,
      "learning_rate": 5e-06,
      "loss": 0.7035,
      "step": 430
    },
    {
      "epoch": 1.5257910706545297,
      "grad_norm": 0.6526217069666287,
      "learning_rate": 5e-06,
      "loss": 0.7062,
      "step": 440
    },
    {
      "epoch": 1.5604681404421328,
      "grad_norm": 0.7252842545795193,
      "learning_rate": 5e-06,
      "loss": 0.7056,
      "step": 450
    },
    {
      "epoch": 1.5951452102297354,
      "grad_norm": 0.6026254905163209,
      "learning_rate": 5e-06,
      "loss": 0.7019,
      "step": 460
    },
    {
      "epoch": 1.6298222800173385,
      "grad_norm": 0.7348281623426512,
      "learning_rate": 5e-06,
      "loss": 0.7019,
      "step": 470
    },
    {
      "epoch": 1.6644993498049416,
      "grad_norm": 0.7277211190473597,
      "learning_rate": 5e-06,
      "loss": 0.7057,
      "step": 480
    },
    {
      "epoch": 1.6991764195925443,
      "grad_norm": 0.8075931048690591,
      "learning_rate": 5e-06,
      "loss": 0.7043,
      "step": 490
    },
    {
      "epoch": 1.7338534893801474,
      "grad_norm": 0.703136267805299,
      "learning_rate": 5e-06,
      "loss": 0.704,
      "step": 500
    },
    {
      "epoch": 1.7685305591677505,
      "grad_norm": 0.6867057678356385,
      "learning_rate": 5e-06,
      "loss": 0.7046,
      "step": 510
    },
    {
      "epoch": 1.8032076289553531,
      "grad_norm": 0.7168492824064608,
      "learning_rate": 5e-06,
      "loss": 0.7031,
      "step": 520
    },
    {
      "epoch": 1.8378846987429562,
      "grad_norm": 0.6468146069695732,
      "learning_rate": 5e-06,
      "loss": 0.7026,
      "step": 530
    },
    {
      "epoch": 1.8725617685305593,
      "grad_norm": 0.5599596596574505,
      "learning_rate": 5e-06,
      "loss": 0.7006,
      "step": 540
    },
    {
      "epoch": 1.907238838318162,
      "grad_norm": 0.7333684396811263,
      "learning_rate": 5e-06,
      "loss": 0.7033,
      "step": 550
    },
    {
      "epoch": 1.941915908105765,
      "grad_norm": 0.6683061963261424,
      "learning_rate": 5e-06,
      "loss": 0.6992,
      "step": 560
    },
    {
      "epoch": 1.976592977893368,
      "grad_norm": 0.655884340314605,
      "learning_rate": 5e-06,
      "loss": 0.6958,
      "step": 570
    },
    {
      "epoch": 1.9973992197659298,
      "eval_loss": 0.7401416897773743,
      "eval_runtime": 195.3569,
      "eval_samples_per_second": 39.768,
      "eval_steps_per_second": 0.624,
      "step": 576
    },
    {
      "epoch": 2.011270047680971,
      "grad_norm": 1.0541355293769905,
      "learning_rate": 5e-06,
      "loss": 0.7266,
      "step": 580
    },
    {
      "epoch": 2.045947117468574,
      "grad_norm": 0.8515109317795075,
      "learning_rate": 5e-06,
      "loss": 0.6496,
      "step": 590
    },
    {
      "epoch": 2.080624187256177,
      "grad_norm": 0.8609098344070957,
      "learning_rate": 5e-06,
      "loss": 0.6459,
      "step": 600
    },
    {
      "epoch": 2.1153012570437797,
      "grad_norm": 0.6709051306842824,
      "learning_rate": 5e-06,
      "loss": 0.6507,
      "step": 610
    },
    {
      "epoch": 2.149978326831383,
      "grad_norm": 0.6884941523677242,
      "learning_rate": 5e-06,
      "loss": 0.6486,
      "step": 620
    },
    {
      "epoch": 2.184655396618986,
      "grad_norm": 0.6822156803842125,
      "learning_rate": 5e-06,
      "loss": 0.6454,
      "step": 630
    },
    {
      "epoch": 2.2193324664065885,
      "grad_norm": 0.783762019991312,
      "learning_rate": 5e-06,
      "loss": 0.6502,
      "step": 640
    },
    {
      "epoch": 2.2540095361941916,
      "grad_norm": 0.7183875213713674,
      "learning_rate": 5e-06,
      "loss": 0.6523,
      "step": 650
    },
    {
      "epoch": 2.2886866059817947,
      "grad_norm": 0.7034570349597838,
      "learning_rate": 5e-06,
      "loss": 0.6512,
      "step": 660
    },
    {
      "epoch": 2.3233636757693974,
      "grad_norm": 0.6107483226470054,
      "learning_rate": 5e-06,
      "loss": 0.6528,
      "step": 670
    },
    {
      "epoch": 2.3580407455570005,
      "grad_norm": 0.6709721031936152,
      "learning_rate": 5e-06,
      "loss": 0.6514,
      "step": 680
    },
    {
      "epoch": 2.3927178153446036,
      "grad_norm": 0.717931740489821,
      "learning_rate": 5e-06,
      "loss": 0.6504,
      "step": 690
    },
    {
      "epoch": 2.4273948851322062,
      "grad_norm": 0.6775786736254632,
      "learning_rate": 5e-06,
      "loss": 0.6568,
      "step": 700
    },
    {
      "epoch": 2.4620719549198093,
      "grad_norm": 0.6141649062955427,
      "learning_rate": 5e-06,
      "loss": 0.6505,
      "step": 710
    },
    {
      "epoch": 2.4967490247074124,
      "grad_norm": 0.6919942537111052,
      "learning_rate": 5e-06,
      "loss": 0.6542,
      "step": 720
    },
    {
      "epoch": 2.531426094495015,
      "grad_norm": 0.7226456763829804,
      "learning_rate": 5e-06,
      "loss": 0.6546,
      "step": 730
    },
    {
      "epoch": 2.566103164282618,
      "grad_norm": 0.56441351482389,
      "learning_rate": 5e-06,
      "loss": 0.6547,
      "step": 740
    },
    {
      "epoch": 2.6007802340702213,
      "grad_norm": 0.6207794336554665,
      "learning_rate": 5e-06,
      "loss": 0.6539,
      "step": 750
    },
    {
      "epoch": 2.635457303857824,
      "grad_norm": 0.5967792415368525,
      "learning_rate": 5e-06,
      "loss": 0.6549,
      "step": 760
    },
    {
      "epoch": 2.670134373645427,
      "grad_norm": 0.7202470628059912,
      "learning_rate": 5e-06,
      "loss": 0.6535,
      "step": 770
    },
    {
      "epoch": 2.70481144343303,
      "grad_norm": 0.6000428861128503,
      "learning_rate": 5e-06,
      "loss": 0.6558,
      "step": 780
    },
    {
      "epoch": 2.739488513220633,
      "grad_norm": 0.6627746592450424,
      "learning_rate": 5e-06,
      "loss": 0.6584,
      "step": 790
    },
    {
      "epoch": 2.774165583008236,
      "grad_norm": 0.6990438570732993,
      "learning_rate": 5e-06,
      "loss": 0.6528,
      "step": 800
    },
    {
      "epoch": 2.808842652795839,
      "grad_norm": 0.6611463955257642,
      "learning_rate": 5e-06,
      "loss": 0.6569,
      "step": 810
    },
    {
      "epoch": 2.8435197225834417,
      "grad_norm": 0.6625666916962145,
      "learning_rate": 5e-06,
      "loss": 0.656,
      "step": 820
    },
    {
      "epoch": 2.8781967923710448,
      "grad_norm": 0.6263198113296461,
      "learning_rate": 5e-06,
      "loss": 0.6535,
      "step": 830
    },
    {
      "epoch": 2.912873862158648,
      "grad_norm": 0.6889694820528142,
      "learning_rate": 5e-06,
      "loss": 0.6523,
      "step": 840
    },
    {
      "epoch": 2.9475509319462505,
      "grad_norm": 0.6566291665898417,
      "learning_rate": 5e-06,
      "loss": 0.6507,
      "step": 850
    },
    {
      "epoch": 2.9822280017338536,
      "grad_norm": 0.5999353492283839,
      "learning_rate": 5e-06,
      "loss": 0.6557,
      "step": 860
    },
    {
      "epoch": 2.9960988296488944,
      "eval_loss": 0.7398399114608765,
      "eval_runtime": 196.1227,
      "eval_samples_per_second": 39.613,
      "eval_steps_per_second": 0.622,
      "step": 864
    },
    {
      "epoch": 2.9960988296488944,
      "step": 864,
      "total_flos": 1447022800404480.0,
      "train_loss": 0.7206557989120483,
      "train_runtime": 28697.4972,
      "train_samples_per_second": 15.431,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 864,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1447022800404480.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}