{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9960988296488944, "eval_steps": 500, "global_step": 864, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.034677069787602946, "grad_norm": 4.0249637874471995, "learning_rate": 5e-06, "loss": 1.0488, "step": 10 }, { "epoch": 0.06935413957520589, "grad_norm": 1.7273291496388328, "learning_rate": 5e-06, "loss": 0.9328, "step": 20 }, { "epoch": 0.10403120936280884, "grad_norm": 2.588347397545261, "learning_rate": 5e-06, "loss": 0.8892, "step": 30 }, { "epoch": 0.13870827915041178, "grad_norm": 1.5360583750484056, "learning_rate": 5e-06, "loss": 0.8717, "step": 40 }, { "epoch": 0.17338534893801474, "grad_norm": 1.1396695772844396, "learning_rate": 5e-06, "loss": 0.8475, "step": 50 }, { "epoch": 0.20806241872561768, "grad_norm": 1.0308386132806844, "learning_rate": 5e-06, "loss": 0.8293, "step": 60 }, { "epoch": 0.24273948851322064, "grad_norm": 1.0528463497860754, "learning_rate": 5e-06, "loss": 0.8176, "step": 70 }, { "epoch": 0.27741655830082357, "grad_norm": 1.0138322547682082, "learning_rate": 5e-06, "loss": 0.8068, "step": 80 }, { "epoch": 0.31209362808842656, "grad_norm": 1.0784904725043016, "learning_rate": 5e-06, "loss": 0.7992, "step": 90 }, { "epoch": 0.3467706978760295, "grad_norm": 0.9906052569468718, "learning_rate": 5e-06, "loss": 0.7991, "step": 100 }, { "epoch": 0.3814477676636324, "grad_norm": 1.1832971820048892, "learning_rate": 5e-06, "loss": 0.7894, "step": 110 }, { "epoch": 0.41612483745123535, "grad_norm": 0.773999499704406, "learning_rate": 5e-06, "loss": 0.7832, "step": 120 }, { "epoch": 0.45080190723883834, "grad_norm": 1.1303467514316676, "learning_rate": 5e-06, "loss": 0.7807, "step": 130 }, { "epoch": 0.48547897702644127, "grad_norm": 0.6069961967445706, "learning_rate": 5e-06, "loss": 0.7857, "step": 140 }, { "epoch": 0.5201560468140443, "grad_norm": 0.6816537291509634, "learning_rate": 5e-06, "loss": 0.7779, "step": 150 }, { "epoch": 0.5548331166016471, "grad_norm": 0.7684116239237637, "learning_rate": 5e-06, "loss": 0.7739, "step": 160 }, { "epoch": 0.5895101863892501, "grad_norm": 0.8491777059398649, "learning_rate": 5e-06, "loss": 0.7757, "step": 170 }, { "epoch": 0.6241872561768531, "grad_norm": 1.0896111775038086, "learning_rate": 5e-06, "loss": 0.7675, "step": 180 }, { "epoch": 0.658864325964456, "grad_norm": 0.8510614586902955, "learning_rate": 5e-06, "loss": 0.7732, "step": 190 }, { "epoch": 0.693541395752059, "grad_norm": 0.6064417861325208, "learning_rate": 5e-06, "loss": 0.7677, "step": 200 }, { "epoch": 0.7282184655396619, "grad_norm": 0.5980019145794307, "learning_rate": 5e-06, "loss": 0.7667, "step": 210 }, { "epoch": 0.7628955353272648, "grad_norm": 0.7017739167578199, "learning_rate": 5e-06, "loss": 0.7644, "step": 220 }, { "epoch": 0.7975726051148678, "grad_norm": 0.6027062835922619, "learning_rate": 5e-06, "loss": 0.7664, "step": 230 }, { "epoch": 0.8322496749024707, "grad_norm": 0.6596293739585312, "learning_rate": 5e-06, "loss": 0.7559, "step": 240 }, { "epoch": 0.8669267446900737, "grad_norm": 0.7441210106410687, "learning_rate": 5e-06, "loss": 0.7612, "step": 250 }, { "epoch": 0.9016038144776767, "grad_norm": 0.7762267182363182, "learning_rate": 5e-06, "loss": 0.758, "step": 260 }, { "epoch": 0.9362808842652796, "grad_norm": 0.6385587861511055, "learning_rate": 5e-06, "loss": 0.759, "step": 270 }, { "epoch": 0.9709579540528825, "grad_norm": 0.6361978634019184, "learning_rate": 5e-06, "loss": 0.7583, "step": 280 }, { "epoch": 0.9986996098829649, "eval_loss": 0.753886342048645, "eval_runtime": 197.8618, "eval_samples_per_second": 39.265, "eval_steps_per_second": 0.617, "step": 288 }, { "epoch": 1.0056350238404854, "grad_norm": 0.9912660198339807, "learning_rate": 5e-06, "loss": 0.7976, "step": 290 }, { "epoch": 1.0403120936280885, "grad_norm": 1.535110610753484, "learning_rate": 5e-06, "loss": 0.7106, "step": 300 }, { "epoch": 1.0749891634156914, "grad_norm": 0.7729921866431072, "learning_rate": 5e-06, "loss": 0.7066, "step": 310 }, { "epoch": 1.1096662332032943, "grad_norm": 0.6463456897522241, "learning_rate": 5e-06, "loss": 0.7058, "step": 320 }, { "epoch": 1.1443433029908974, "grad_norm": 0.6035036766400345, "learning_rate": 5e-06, "loss": 0.7031, "step": 330 }, { "epoch": 1.1790203727785002, "grad_norm": 0.7290582812008339, "learning_rate": 5e-06, "loss": 0.6997, "step": 340 }, { "epoch": 1.2136974425661031, "grad_norm": 0.6606339548104747, "learning_rate": 5e-06, "loss": 0.7048, "step": 350 }, { "epoch": 1.2483745123537062, "grad_norm": 0.7747101369350952, "learning_rate": 5e-06, "loss": 0.7049, "step": 360 }, { "epoch": 1.283051582141309, "grad_norm": 0.6447861318282291, "learning_rate": 5e-06, "loss": 0.7025, "step": 370 }, { "epoch": 1.317728651928912, "grad_norm": 0.5446961444193648, "learning_rate": 5e-06, "loss": 0.7071, "step": 380 }, { "epoch": 1.352405721716515, "grad_norm": 0.6316405546311903, "learning_rate": 5e-06, "loss": 0.7062, "step": 390 }, { "epoch": 1.387082791504118, "grad_norm": 0.7714724505395334, "learning_rate": 5e-06, "loss": 0.7013, "step": 400 }, { "epoch": 1.4217598612917208, "grad_norm": 0.6435708845447846, "learning_rate": 5e-06, "loss": 0.7043, "step": 410 }, { "epoch": 1.456436931079324, "grad_norm": 0.6549313245316034, "learning_rate": 5e-06, "loss": 0.705, "step": 420 }, { "epoch": 1.4911140008669268, "grad_norm": 0.6778607750028962, "learning_rate": 5e-06, "loss": 0.7035, "step": 430 }, { "epoch": 1.5257910706545297, "grad_norm": 0.6526217069666287, "learning_rate": 5e-06, "loss": 0.7062, "step": 440 }, { "epoch": 1.5604681404421328, "grad_norm": 0.7252842545795193, "learning_rate": 5e-06, "loss": 0.7056, "step": 450 }, { "epoch": 1.5951452102297354, "grad_norm": 0.6026254905163209, "learning_rate": 5e-06, "loss": 0.7019, "step": 460 }, { "epoch": 1.6298222800173385, "grad_norm": 0.7348281623426512, "learning_rate": 5e-06, "loss": 0.7019, "step": 470 }, { "epoch": 1.6644993498049416, "grad_norm": 0.7277211190473597, "learning_rate": 5e-06, "loss": 0.7057, "step": 480 }, { "epoch": 1.6991764195925443, "grad_norm": 0.8075931048690591, "learning_rate": 5e-06, "loss": 0.7043, "step": 490 }, { "epoch": 1.7338534893801474, "grad_norm": 0.703136267805299, "learning_rate": 5e-06, "loss": 0.704, "step": 500 }, { "epoch": 1.7685305591677505, "grad_norm": 0.6867057678356385, "learning_rate": 5e-06, "loss": 0.7046, "step": 510 }, { "epoch": 1.8032076289553531, "grad_norm": 0.7168492824064608, "learning_rate": 5e-06, "loss": 0.7031, "step": 520 }, { "epoch": 1.8378846987429562, "grad_norm": 0.6468146069695732, "learning_rate": 5e-06, "loss": 0.7026, "step": 530 }, { "epoch": 1.8725617685305593, "grad_norm": 0.5599596596574505, "learning_rate": 5e-06, "loss": 0.7006, "step": 540 }, { "epoch": 1.907238838318162, "grad_norm": 0.7333684396811263, "learning_rate": 5e-06, "loss": 0.7033, "step": 550 }, { "epoch": 1.941915908105765, "grad_norm": 0.6683061963261424, "learning_rate": 5e-06, "loss": 0.6992, "step": 560 }, { "epoch": 1.976592977893368, "grad_norm": 0.655884340314605, "learning_rate": 5e-06, "loss": 0.6958, "step": 570 }, { "epoch": 1.9973992197659298, "eval_loss": 0.7401416897773743, "eval_runtime": 195.3569, "eval_samples_per_second": 39.768, "eval_steps_per_second": 0.624, "step": 576 }, { "epoch": 2.011270047680971, "grad_norm": 1.0541355293769905, "learning_rate": 5e-06, "loss": 0.7266, "step": 580 }, { "epoch": 2.045947117468574, "grad_norm": 0.8515109317795075, "learning_rate": 5e-06, "loss": 0.6496, "step": 590 }, { "epoch": 2.080624187256177, "grad_norm": 0.8609098344070957, "learning_rate": 5e-06, "loss": 0.6459, "step": 600 }, { "epoch": 2.1153012570437797, "grad_norm": 0.6709051306842824, "learning_rate": 5e-06, "loss": 0.6507, "step": 610 }, { "epoch": 2.149978326831383, "grad_norm": 0.6884941523677242, "learning_rate": 5e-06, "loss": 0.6486, "step": 620 }, { "epoch": 2.184655396618986, "grad_norm": 0.6822156803842125, "learning_rate": 5e-06, "loss": 0.6454, "step": 630 }, { "epoch": 2.2193324664065885, "grad_norm": 0.783762019991312, "learning_rate": 5e-06, "loss": 0.6502, "step": 640 }, { "epoch": 2.2540095361941916, "grad_norm": 0.7183875213713674, "learning_rate": 5e-06, "loss": 0.6523, "step": 650 }, { "epoch": 2.2886866059817947, "grad_norm": 0.7034570349597838, "learning_rate": 5e-06, "loss": 0.6512, "step": 660 }, { "epoch": 2.3233636757693974, "grad_norm": 0.6107483226470054, "learning_rate": 5e-06, "loss": 0.6528, "step": 670 }, { "epoch": 2.3580407455570005, "grad_norm": 0.6709721031936152, "learning_rate": 5e-06, "loss": 0.6514, "step": 680 }, { "epoch": 2.3927178153446036, "grad_norm": 0.717931740489821, "learning_rate": 5e-06, "loss": 0.6504, "step": 690 }, { "epoch": 2.4273948851322062, "grad_norm": 0.6775786736254632, "learning_rate": 5e-06, "loss": 0.6568, "step": 700 }, { "epoch": 2.4620719549198093, "grad_norm": 0.6141649062955427, "learning_rate": 5e-06, "loss": 0.6505, "step": 710 }, { "epoch": 2.4967490247074124, "grad_norm": 0.6919942537111052, "learning_rate": 5e-06, "loss": 0.6542, "step": 720 }, { "epoch": 2.531426094495015, "grad_norm": 0.7226456763829804, "learning_rate": 5e-06, "loss": 0.6546, "step": 730 }, { "epoch": 2.566103164282618, "grad_norm": 0.56441351482389, "learning_rate": 5e-06, "loss": 0.6547, "step": 740 }, { "epoch": 2.6007802340702213, "grad_norm": 0.6207794336554665, "learning_rate": 5e-06, "loss": 0.6539, "step": 750 }, { "epoch": 2.635457303857824, "grad_norm": 0.5967792415368525, "learning_rate": 5e-06, "loss": 0.6549, "step": 760 }, { "epoch": 2.670134373645427, "grad_norm": 0.7202470628059912, "learning_rate": 5e-06, "loss": 0.6535, "step": 770 }, { "epoch": 2.70481144343303, "grad_norm": 0.6000428861128503, "learning_rate": 5e-06, "loss": 0.6558, "step": 780 }, { "epoch": 2.739488513220633, "grad_norm": 0.6627746592450424, "learning_rate": 5e-06, "loss": 0.6584, "step": 790 }, { "epoch": 2.774165583008236, "grad_norm": 0.6990438570732993, "learning_rate": 5e-06, "loss": 0.6528, "step": 800 }, { "epoch": 2.808842652795839, "grad_norm": 0.6611463955257642, "learning_rate": 5e-06, "loss": 0.6569, "step": 810 }, { "epoch": 2.8435197225834417, "grad_norm": 0.6625666916962145, "learning_rate": 5e-06, "loss": 0.656, "step": 820 }, { "epoch": 2.8781967923710448, "grad_norm": 0.6263198113296461, "learning_rate": 5e-06, "loss": 0.6535, "step": 830 }, { "epoch": 2.912873862158648, "grad_norm": 0.6889694820528142, "learning_rate": 5e-06, "loss": 0.6523, "step": 840 }, { "epoch": 2.9475509319462505, "grad_norm": 0.6566291665898417, "learning_rate": 5e-06, "loss": 0.6507, "step": 850 }, { "epoch": 2.9822280017338536, "grad_norm": 0.5999353492283839, "learning_rate": 5e-06, "loss": 0.6557, "step": 860 }, { "epoch": 2.9960988296488944, "eval_loss": 0.7398399114608765, "eval_runtime": 196.1227, "eval_samples_per_second": 39.613, "eval_steps_per_second": 0.622, "step": 864 }, { "epoch": 2.9960988296488944, "step": 864, "total_flos": 1447022800404480.0, "train_loss": 0.7206557989120483, "train_runtime": 28697.4972, "train_samples_per_second": 15.431, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 864, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1447022800404480.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }