{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 363, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013793103448275862, "grad_norm": 1.2861284017562866, "learning_rate": 1.3186813186813187e-06, "loss": 1.2808, "step": 5 }, { "epoch": 0.027586206896551724, "grad_norm": 1.0939148664474487, "learning_rate": 2.967032967032967e-06, "loss": 1.3562, "step": 10 }, { "epoch": 0.041379310344827586, "grad_norm": 0.7352843284606934, "learning_rate": 4.615384615384616e-06, "loss": 1.2664, "step": 15 }, { "epoch": 0.05517241379310345, "grad_norm": 0.644305944442749, "learning_rate": 6.2637362637362645e-06, "loss": 1.2978, "step": 20 }, { "epoch": 0.06896551724137931, "grad_norm": 0.5266907215118408, "learning_rate": 7.912087912087913e-06, "loss": 1.3218, "step": 25 }, { "epoch": 0.08275862068965517, "grad_norm": 0.5709158778190613, "learning_rate": 9.56043956043956e-06, "loss": 1.2448, "step": 30 }, { "epoch": 0.09655172413793103, "grad_norm": 0.5846296548843384, "learning_rate": 1.120879120879121e-05, "loss": 1.2439, "step": 35 }, { "epoch": 0.1103448275862069, "grad_norm": 0.4904540479183197, "learning_rate": 1.2857142857142857e-05, "loss": 1.2162, "step": 40 }, { "epoch": 0.12413793103448276, "grad_norm": 0.5115934014320374, "learning_rate": 1.4505494505494506e-05, "loss": 1.2357, "step": 45 }, { "epoch": 0.13793103448275862, "grad_norm": 0.4867129325866699, "learning_rate": 1.6153846153846154e-05, "loss": 1.1822, "step": 50 }, { "epoch": 0.15172413793103448, "grad_norm": 0.6010071039199829, "learning_rate": 1.78021978021978e-05, "loss": 1.1464, "step": 55 }, { "epoch": 0.16551724137931034, "grad_norm": 0.43062254786491394, "learning_rate": 1.9450549450549452e-05, "loss": 1.1763, "step": 60 }, { "epoch": 0.1793103448275862, "grad_norm": 0.5273615717887878, "learning_rate": 2.10989010989011e-05, "loss": 1.1936, "step": 65 }, { "epoch": 0.19310344827586207, "grad_norm": 0.46626806259155273, "learning_rate": 2.2747252747252748e-05, "loss": 1.1823, "step": 70 }, { "epoch": 0.20689655172413793, "grad_norm": 0.5032982230186462, "learning_rate": 2.4395604395604395e-05, "loss": 1.1455, "step": 75 }, { "epoch": 0.2206896551724138, "grad_norm": 0.4601927101612091, "learning_rate": 2.6043956043956046e-05, "loss": 1.126, "step": 80 }, { "epoch": 0.23448275862068965, "grad_norm": 0.5814331769943237, "learning_rate": 2.7692307692307694e-05, "loss": 1.1835, "step": 85 }, { "epoch": 0.2482758620689655, "grad_norm": 0.5030220150947571, "learning_rate": 2.934065934065934e-05, "loss": 1.1253, "step": 90 }, { "epoch": 0.2620689655172414, "grad_norm": 0.5223703980445862, "learning_rate": 2.9999775855589334e-05, "loss": 1.1587, "step": 95 }, { "epoch": 0.27586206896551724, "grad_norm": 0.5958484411239624, "learning_rate": 2.9998406108449657e-05, "loss": 1.0926, "step": 100 }, { "epoch": 0.2896551724137931, "grad_norm": 0.500812292098999, "learning_rate": 2.9995791252416083e-05, "loss": 1.0628, "step": 105 }, { "epoch": 0.30344827586206896, "grad_norm": 0.5202997326850891, "learning_rate": 2.9991931504563725e-05, "loss": 1.1535, "step": 110 }, { "epoch": 0.31724137931034485, "grad_norm": 0.47132763266563416, "learning_rate": 2.9986827185313715e-05, "loss": 1.0873, "step": 115 }, { "epoch": 0.3310344827586207, "grad_norm": 0.5247320532798767, "learning_rate": 2.998047871840664e-05, "loss": 1.0437, "step": 120 }, { "epoch": 0.3448275862068966, "grad_norm": 0.6326524019241333, "learning_rate": 2.9972886630867334e-05, "loss": 1.0278, "step": 125 }, { "epoch": 0.3586206896551724, "grad_norm": 0.6137468814849854, "learning_rate": 2.996405155296116e-05, "loss": 0.9953, "step": 130 }, { "epoch": 0.3724137931034483, "grad_norm": 0.6893109679222107, "learning_rate": 2.995397421814165e-05, "loss": 1.0209, "step": 135 }, { "epoch": 0.38620689655172413, "grad_norm": 0.6077777743339539, "learning_rate": 2.994265546298965e-05, "loss": 0.9781, "step": 140 }, { "epoch": 0.4, "grad_norm": 0.6185184717178345, "learning_rate": 2.993009622714385e-05, "loss": 1.0459, "step": 145 }, { "epoch": 0.41379310344827586, "grad_norm": 0.6799812912940979, "learning_rate": 2.991629755322279e-05, "loss": 1.0188, "step": 150 }, { "epoch": 0.42758620689655175, "grad_norm": 0.6423669457435608, "learning_rate": 2.9901260586738305e-05, "loss": 1.036, "step": 155 }, { "epoch": 0.4413793103448276, "grad_norm": 0.7912122011184692, "learning_rate": 2.9884986576000416e-05, "loss": 0.9727, "step": 160 }, { "epoch": 0.45517241379310347, "grad_norm": 0.6771214604377747, "learning_rate": 2.9867476872013707e-05, "loss": 0.9286, "step": 165 }, { "epoch": 0.4689655172413793, "grad_norm": 0.6864525079727173, "learning_rate": 2.9848732928365188e-05, "loss": 0.9362, "step": 170 }, { "epoch": 0.4827586206896552, "grad_norm": 0.7123166918754578, "learning_rate": 2.98287563011036e-05, "loss": 0.9029, "step": 175 }, { "epoch": 0.496551724137931, "grad_norm": 0.6900257468223572, "learning_rate": 2.9807548648610238e-05, "loss": 0.9367, "step": 180 }, { "epoch": 0.5103448275862069, "grad_norm": 0.7352802157402039, "learning_rate": 2.9785111731461306e-05, "loss": 0.9544, "step": 185 }, { "epoch": 0.5241379310344828, "grad_norm": 0.7163735628128052, "learning_rate": 2.976144741228173e-05, "loss": 0.8899, "step": 190 }, { "epoch": 0.5379310344827586, "grad_norm": 0.7872961759567261, "learning_rate": 2.9736557655590536e-05, "loss": 0.939, "step": 195 }, { "epoch": 0.5517241379310345, "grad_norm": 0.7545033097267151, "learning_rate": 2.9710444527637785e-05, "loss": 0.8738, "step": 200 }, { "epoch": 0.5655172413793104, "grad_norm": 0.8260722756385803, "learning_rate": 2.9683110196233022e-05, "loss": 0.9045, "step": 205 }, { "epoch": 0.5793103448275863, "grad_norm": 0.7296833395957947, "learning_rate": 2.96545569305653e-05, "loss": 0.904, "step": 210 }, { "epoch": 0.593103448275862, "grad_norm": 0.8601064085960388, "learning_rate": 2.9624787101014838e-05, "loss": 0.8197, "step": 215 }, { "epoch": 0.6068965517241379, "grad_norm": 0.8468939065933228, "learning_rate": 2.9593803178956208e-05, "loss": 0.836, "step": 220 }, { "epoch": 0.6206896551724138, "grad_norm": 0.7784414887428284, "learning_rate": 2.9561607736553194e-05, "loss": 0.8486, "step": 225 }, { "epoch": 0.6344827586206897, "grad_norm": 0.9513041973114014, "learning_rate": 2.952820344654524e-05, "loss": 0.8396, "step": 230 }, { "epoch": 0.6482758620689655, "grad_norm": 0.8774826526641846, "learning_rate": 2.9493593082025586e-05, "loss": 0.8223, "step": 235 }, { "epoch": 0.6620689655172414, "grad_norm": 0.8445528149604797, "learning_rate": 2.9457779516211057e-05, "loss": 0.8663, "step": 240 }, { "epoch": 0.6758620689655173, "grad_norm": 1.1822575330734253, "learning_rate": 2.9420765722203522e-05, "loss": 0.8806, "step": 245 }, { "epoch": 0.6896551724137931, "grad_norm": 0.8731847405433655, "learning_rate": 2.9382554772743092e-05, "loss": 0.8232, "step": 250 }, { "epoch": 0.7034482758620689, "grad_norm": 0.9695132970809937, "learning_rate": 2.9343149839953044e-05, "loss": 0.7962, "step": 255 }, { "epoch": 0.7172413793103448, "grad_norm": 0.9002068638801575, "learning_rate": 2.9302554195076462e-05, "loss": 0.812, "step": 260 }, { "epoch": 0.7310344827586207, "grad_norm": 0.9631876349449158, "learning_rate": 2.926077120820468e-05, "loss": 0.7827, "step": 265 }, { "epoch": 0.7448275862068966, "grad_norm": 0.8464678525924683, "learning_rate": 2.9217804347997514e-05, "loss": 0.7673, "step": 270 }, { "epoch": 0.7586206896551724, "grad_norm": 0.8802338242530823, "learning_rate": 2.9173657181395308e-05, "loss": 0.7825, "step": 275 }, { "epoch": 0.7724137931034483, "grad_norm": 0.972342312335968, "learning_rate": 2.912833337332281e-05, "loss": 0.7638, "step": 280 }, { "epoch": 0.7862068965517242, "grad_norm": 0.9336219429969788, "learning_rate": 2.9081836686384934e-05, "loss": 0.7503, "step": 285 }, { "epoch": 0.8, "grad_norm": 0.9606114625930786, "learning_rate": 2.90341709805544e-05, "loss": 0.7615, "step": 290 }, { "epoch": 0.8137931034482758, "grad_norm": 0.9412868618965149, "learning_rate": 2.8985340212851304e-05, "loss": 0.7718, "step": 295 }, { "epoch": 0.8275862068965517, "grad_norm": 1.0660676956176758, "learning_rate": 2.89353484370146e-05, "loss": 0.7839, "step": 300 }, { "epoch": 0.8413793103448276, "grad_norm": 0.9120925068855286, "learning_rate": 2.888419980316559e-05, "loss": 0.7595, "step": 305 }, { "epoch": 0.8551724137931035, "grad_norm": 0.8678473234176636, "learning_rate": 2.88318985574634e-05, "loss": 0.7418, "step": 310 }, { "epoch": 0.8689655172413793, "grad_norm": 1.0073208808898926, "learning_rate": 2.8778449041752463e-05, "loss": 0.7102, "step": 315 }, { "epoch": 0.8827586206896552, "grad_norm": 0.9804509282112122, "learning_rate": 2.8723855693202103e-05, "loss": 0.7199, "step": 320 }, { "epoch": 0.896551724137931, "grad_norm": 1.1153241395950317, "learning_rate": 2.866812304393816e-05, "loss": 0.7347, "step": 325 }, { "epoch": 0.9103448275862069, "grad_norm": 0.9621334671974182, "learning_rate": 2.8611255720666743e-05, "loss": 0.7268, "step": 330 }, { "epoch": 0.9241379310344827, "grad_norm": 1.0361660718917847, "learning_rate": 2.8553258444290155e-05, "loss": 0.6755, "step": 335 }, { "epoch": 0.9379310344827586, "grad_norm": 1.0129109621047974, "learning_rate": 2.8494136029514992e-05, "loss": 0.6995, "step": 340 }, { "epoch": 0.9517241379310345, "grad_norm": 1.198005199432373, "learning_rate": 2.84338933844524e-05, "loss": 0.6903, "step": 345 }, { "epoch": 0.9655172413793104, "grad_norm": 1.121144413948059, "learning_rate": 2.8372535510210694e-05, "loss": 0.6686, "step": 350 }, { "epoch": 0.9793103448275862, "grad_norm": 1.0109779834747314, "learning_rate": 2.8310067500480105e-05, "loss": 0.6309, "step": 355 }, { "epoch": 0.993103448275862, "grad_norm": 1.0399612188339233, "learning_rate": 2.8246494541109985e-05, "loss": 0.7209, "step": 360 } ], "logging_steps": 5, "max_steps": 1815, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.1438772296246886e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }