{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 363,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013793103448275862,
      "grad_norm": 1.2861284017562866,
      "learning_rate": 1.3186813186813187e-06,
      "loss": 1.2808,
      "step": 5
    },
    {
      "epoch": 0.027586206896551724,
      "grad_norm": 1.0939148664474487,
      "learning_rate": 2.967032967032967e-06,
      "loss": 1.3562,
      "step": 10
    },
    {
      "epoch": 0.041379310344827586,
      "grad_norm": 0.7352843284606934,
      "learning_rate": 4.615384615384616e-06,
      "loss": 1.2664,
      "step": 15
    },
    {
      "epoch": 0.05517241379310345,
      "grad_norm": 0.644305944442749,
      "learning_rate": 6.2637362637362645e-06,
      "loss": 1.2978,
      "step": 20
    },
    {
      "epoch": 0.06896551724137931,
      "grad_norm": 0.5266907215118408,
      "learning_rate": 7.912087912087913e-06,
      "loss": 1.3218,
      "step": 25
    },
    {
      "epoch": 0.08275862068965517,
      "grad_norm": 0.5709158778190613,
      "learning_rate": 9.56043956043956e-06,
      "loss": 1.2448,
      "step": 30
    },
    {
      "epoch": 0.09655172413793103,
      "grad_norm": 0.5846296548843384,
      "learning_rate": 1.120879120879121e-05,
      "loss": 1.2439,
      "step": 35
    },
    {
      "epoch": 0.1103448275862069,
      "grad_norm": 0.4904540479183197,
      "learning_rate": 1.2857142857142857e-05,
      "loss": 1.2162,
      "step": 40
    },
    {
      "epoch": 0.12413793103448276,
      "grad_norm": 0.5115934014320374,
      "learning_rate": 1.4505494505494506e-05,
      "loss": 1.2357,
      "step": 45
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.4867129325866699,
      "learning_rate": 1.6153846153846154e-05,
      "loss": 1.1822,
      "step": 50
    },
    {
      "epoch": 0.15172413793103448,
      "grad_norm": 0.6010071039199829,
      "learning_rate": 1.78021978021978e-05,
      "loss": 1.1464,
      "step": 55
    },
    {
      "epoch": 0.16551724137931034,
      "grad_norm": 0.43062254786491394,
      "learning_rate": 1.9450549450549452e-05,
      "loss": 1.1763,
      "step": 60
    },
    {
      "epoch": 0.1793103448275862,
      "grad_norm": 0.5273615717887878,
      "learning_rate": 2.10989010989011e-05,
      "loss": 1.1936,
      "step": 65
    },
    {
      "epoch": 0.19310344827586207,
      "grad_norm": 0.46626806259155273,
      "learning_rate": 2.2747252747252748e-05,
      "loss": 1.1823,
      "step": 70
    },
    {
      "epoch": 0.20689655172413793,
      "grad_norm": 0.5032982230186462,
      "learning_rate": 2.4395604395604395e-05,
      "loss": 1.1455,
      "step": 75
    },
    {
      "epoch": 0.2206896551724138,
      "grad_norm": 0.4601927101612091,
      "learning_rate": 2.6043956043956046e-05,
      "loss": 1.126,
      "step": 80
    },
    {
      "epoch": 0.23448275862068965,
      "grad_norm": 0.5814331769943237,
      "learning_rate": 2.7692307692307694e-05,
      "loss": 1.1835,
      "step": 85
    },
    {
      "epoch": 0.2482758620689655,
      "grad_norm": 0.5030220150947571,
      "learning_rate": 2.934065934065934e-05,
      "loss": 1.1253,
      "step": 90
    },
    {
      "epoch": 0.2620689655172414,
      "grad_norm": 0.5223703980445862,
      "learning_rate": 2.9999775855589334e-05,
      "loss": 1.1587,
      "step": 95
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.5958484411239624,
      "learning_rate": 2.9998406108449657e-05,
      "loss": 1.0926,
      "step": 100
    },
    {
      "epoch": 0.2896551724137931,
      "grad_norm": 0.500812292098999,
      "learning_rate": 2.9995791252416083e-05,
      "loss": 1.0628,
      "step": 105
    },
    {
      "epoch": 0.30344827586206896,
      "grad_norm": 0.5202997326850891,
      "learning_rate": 2.9991931504563725e-05,
      "loss": 1.1535,
      "step": 110
    },
    {
      "epoch": 0.31724137931034485,
      "grad_norm": 0.47132763266563416,
      "learning_rate": 2.9986827185313715e-05,
      "loss": 1.0873,
      "step": 115
    },
    {
      "epoch": 0.3310344827586207,
      "grad_norm": 0.5247320532798767,
      "learning_rate": 2.998047871840664e-05,
      "loss": 1.0437,
      "step": 120
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 0.6326524019241333,
      "learning_rate": 2.9972886630867334e-05,
      "loss": 1.0278,
      "step": 125
    },
    {
      "epoch": 0.3586206896551724,
      "grad_norm": 0.6137468814849854,
      "learning_rate": 2.996405155296116e-05,
      "loss": 0.9953,
      "step": 130
    },
    {
      "epoch": 0.3724137931034483,
      "grad_norm": 0.6893109679222107,
      "learning_rate": 2.995397421814165e-05,
      "loss": 1.0209,
      "step": 135
    },
    {
      "epoch": 0.38620689655172413,
      "grad_norm": 0.6077777743339539,
      "learning_rate": 2.994265546298965e-05,
      "loss": 0.9781,
      "step": 140
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.6185184717178345,
      "learning_rate": 2.993009622714385e-05,
      "loss": 1.0459,
      "step": 145
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.6799812912940979,
      "learning_rate": 2.991629755322279e-05,
      "loss": 1.0188,
      "step": 150
    },
    {
      "epoch": 0.42758620689655175,
      "grad_norm": 0.6423669457435608,
      "learning_rate": 2.9901260586738305e-05,
      "loss": 1.036,
      "step": 155
    },
    {
      "epoch": 0.4413793103448276,
      "grad_norm": 0.7912122011184692,
      "learning_rate": 2.9884986576000416e-05,
      "loss": 0.9727,
      "step": 160
    },
    {
      "epoch": 0.45517241379310347,
      "grad_norm": 0.6771214604377747,
      "learning_rate": 2.9867476872013707e-05,
      "loss": 0.9286,
      "step": 165
    },
    {
      "epoch": 0.4689655172413793,
      "grad_norm": 0.6864525079727173,
      "learning_rate": 2.9848732928365188e-05,
      "loss": 0.9362,
      "step": 170
    },
    {
      "epoch": 0.4827586206896552,
      "grad_norm": 0.7123166918754578,
      "learning_rate": 2.98287563011036e-05,
      "loss": 0.9029,
      "step": 175
    },
    {
      "epoch": 0.496551724137931,
      "grad_norm": 0.6900257468223572,
      "learning_rate": 2.9807548648610238e-05,
      "loss": 0.9367,
      "step": 180
    },
    {
      "epoch": 0.5103448275862069,
      "grad_norm": 0.7352802157402039,
      "learning_rate": 2.9785111731461306e-05,
      "loss": 0.9544,
      "step": 185
    },
    {
      "epoch": 0.5241379310344828,
      "grad_norm": 0.7163735628128052,
      "learning_rate": 2.976144741228173e-05,
      "loss": 0.8899,
      "step": 190
    },
    {
      "epoch": 0.5379310344827586,
      "grad_norm": 0.7872961759567261,
      "learning_rate": 2.9736557655590536e-05,
      "loss": 0.939,
      "step": 195
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.7545033097267151,
      "learning_rate": 2.9710444527637785e-05,
      "loss": 0.8738,
      "step": 200
    },
    {
      "epoch": 0.5655172413793104,
      "grad_norm": 0.8260722756385803,
      "learning_rate": 2.9683110196233022e-05,
      "loss": 0.9045,
      "step": 205
    },
    {
      "epoch": 0.5793103448275863,
      "grad_norm": 0.7296833395957947,
      "learning_rate": 2.96545569305653e-05,
      "loss": 0.904,
      "step": 210
    },
    {
      "epoch": 0.593103448275862,
      "grad_norm": 0.8601064085960388,
      "learning_rate": 2.9624787101014838e-05,
      "loss": 0.8197,
      "step": 215
    },
    {
      "epoch": 0.6068965517241379,
      "grad_norm": 0.8468939065933228,
      "learning_rate": 2.9593803178956208e-05,
      "loss": 0.836,
      "step": 220
    },
    {
      "epoch": 0.6206896551724138,
      "grad_norm": 0.7784414887428284,
      "learning_rate": 2.9561607736553194e-05,
      "loss": 0.8486,
      "step": 225
    },
    {
      "epoch": 0.6344827586206897,
      "grad_norm": 0.9513041973114014,
      "learning_rate": 2.952820344654524e-05,
      "loss": 0.8396,
      "step": 230
    },
    {
      "epoch": 0.6482758620689655,
      "grad_norm": 0.8774826526641846,
      "learning_rate": 2.9493593082025586e-05,
      "loss": 0.8223,
      "step": 235
    },
    {
      "epoch": 0.6620689655172414,
      "grad_norm": 0.8445528149604797,
      "learning_rate": 2.9457779516211057e-05,
      "loss": 0.8663,
      "step": 240
    },
    {
      "epoch": 0.6758620689655173,
      "grad_norm": 1.1822575330734253,
      "learning_rate": 2.9420765722203522e-05,
      "loss": 0.8806,
      "step": 245
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.8731847405433655,
      "learning_rate": 2.9382554772743092e-05,
      "loss": 0.8232,
      "step": 250
    },
    {
      "epoch": 0.7034482758620689,
      "grad_norm": 0.9695132970809937,
      "learning_rate": 2.9343149839953044e-05,
      "loss": 0.7962,
      "step": 255
    },
    {
      "epoch": 0.7172413793103448,
      "grad_norm": 0.9002068638801575,
      "learning_rate": 2.9302554195076462e-05,
      "loss": 0.812,
      "step": 260
    },
    {
      "epoch": 0.7310344827586207,
      "grad_norm": 0.9631876349449158,
      "learning_rate": 2.926077120820468e-05,
      "loss": 0.7827,
      "step": 265
    },
    {
      "epoch": 0.7448275862068966,
      "grad_norm": 0.8464678525924683,
      "learning_rate": 2.9217804347997514e-05,
      "loss": 0.7673,
      "step": 270
    },
    {
      "epoch": 0.7586206896551724,
      "grad_norm": 0.8802338242530823,
      "learning_rate": 2.9173657181395308e-05,
      "loss": 0.7825,
      "step": 275
    },
    {
      "epoch": 0.7724137931034483,
      "grad_norm": 0.972342312335968,
      "learning_rate": 2.912833337332281e-05,
      "loss": 0.7638,
      "step": 280
    },
    {
      "epoch": 0.7862068965517242,
      "grad_norm": 0.9336219429969788,
      "learning_rate": 2.9081836686384934e-05,
      "loss": 0.7503,
      "step": 285
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9606114625930786,
      "learning_rate": 2.90341709805544e-05,
      "loss": 0.7615,
      "step": 290
    },
    {
      "epoch": 0.8137931034482758,
      "grad_norm": 0.9412868618965149,
      "learning_rate": 2.8985340212851304e-05,
      "loss": 0.7718,
      "step": 295
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 1.0660676956176758,
      "learning_rate": 2.89353484370146e-05,
      "loss": 0.7839,
      "step": 300
    },
    {
      "epoch": 0.8413793103448276,
      "grad_norm": 0.9120925068855286,
      "learning_rate": 2.888419980316559e-05,
      "loss": 0.7595,
      "step": 305
    },
    {
      "epoch": 0.8551724137931035,
      "grad_norm": 0.8678473234176636,
      "learning_rate": 2.88318985574634e-05,
      "loss": 0.7418,
      "step": 310
    },
    {
      "epoch": 0.8689655172413793,
      "grad_norm": 1.0073208808898926,
      "learning_rate": 2.8778449041752463e-05,
      "loss": 0.7102,
      "step": 315
    },
    {
      "epoch": 0.8827586206896552,
      "grad_norm": 0.9804509282112122,
      "learning_rate": 2.8723855693202103e-05,
      "loss": 0.7199,
      "step": 320
    },
    {
      "epoch": 0.896551724137931,
      "grad_norm": 1.1153241395950317,
      "learning_rate": 2.866812304393816e-05,
      "loss": 0.7347,
      "step": 325
    },
    {
      "epoch": 0.9103448275862069,
      "grad_norm": 0.9621334671974182,
      "learning_rate": 2.8611255720666743e-05,
      "loss": 0.7268,
      "step": 330
    },
    {
      "epoch": 0.9241379310344827,
      "grad_norm": 1.0361660718917847,
      "learning_rate": 2.8553258444290155e-05,
      "loss": 0.6755,
      "step": 335
    },
    {
      "epoch": 0.9379310344827586,
      "grad_norm": 1.0129109621047974,
      "learning_rate": 2.8494136029514992e-05,
      "loss": 0.6995,
      "step": 340
    },
    {
      "epoch": 0.9517241379310345,
      "grad_norm": 1.198005199432373,
      "learning_rate": 2.84338933844524e-05,
      "loss": 0.6903,
      "step": 345
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 1.121144413948059,
      "learning_rate": 2.8372535510210694e-05,
      "loss": 0.6686,
      "step": 350
    },
    {
      "epoch": 0.9793103448275862,
      "grad_norm": 1.0109779834747314,
      "learning_rate": 2.8310067500480105e-05,
      "loss": 0.6309,
      "step": 355
    },
    {
      "epoch": 0.993103448275862,
      "grad_norm": 1.0399612188339233,
      "learning_rate": 2.8246494541109985e-05,
      "loss": 0.7209,
      "step": 360
    }
  ],
  "logging_steps": 5,
  "max_steps": 1815,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.1438772296246886e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}