{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32, "grad_norm": 0.18415243923664093, "learning_rate": 3.157894736842105e-05, "loss": 0.9158, "step": 6 }, { "epoch": 0.63, "grad_norm": 0.17151106894016266, "learning_rate": 6.31578947368421e-05, "loss": 0.8454, "step": 12 }, { "epoch": 0.95, "grad_norm": 0.253557026386261, "learning_rate": 9.473684210526316e-05, "loss": 0.9068, "step": 18 }, { "epoch": 1.26, "grad_norm": 0.44574955105781555, "learning_rate": 0.0001263157894736842, "loss": 0.781, "step": 24 }, { "epoch": 1.58, "grad_norm": 0.3264688551425934, "learning_rate": 0.00015789473684210527, "loss": 0.6487, "step": 30 }, { "epoch": 1.89, "grad_norm": 0.33505332469940186, "learning_rate": 0.00018947368421052632, "loss": 0.6935, "step": 36 }, { "epoch": 2.21, "grad_norm": 0.3825117349624634, "learning_rate": 0.0001976608187134503, "loss": 0.606, "step": 42 }, { "epoch": 2.53, "grad_norm": 0.31299710273742676, "learning_rate": 0.00019415204678362573, "loss": 0.513, "step": 48 }, { "epoch": 2.84, "grad_norm": 0.5375415682792664, "learning_rate": 0.00019064327485380117, "loss": 0.5365, "step": 54 }, { "epoch": 3.16, "grad_norm": 0.4755648374557495, "learning_rate": 0.0001871345029239766, "loss": 0.3908, "step": 60 }, { "epoch": 3.47, "grad_norm": 0.9289490580558777, "learning_rate": 0.00018362573099415207, "loss": 0.3798, "step": 66 }, { "epoch": 3.79, "grad_norm": 0.4700639247894287, "learning_rate": 0.0001801169590643275, "loss": 0.4379, "step": 72 }, { "epoch": 4.11, "grad_norm": 0.5703785419464111, "learning_rate": 0.00017660818713450294, "loss": 0.3724, "step": 78 }, { "epoch": 4.42, "grad_norm": 0.6487219333648682, "learning_rate": 0.00017309941520467836, "loss": 0.3439, "step": 84 }, { "epoch": 4.74, "grad_norm": 0.599611759185791, "learning_rate": 0.0001695906432748538, "loss": 0.2867, "step": 90 }, { "epoch": 5.05, "grad_norm": 0.5314879417419434, "learning_rate": 0.00016608187134502925, "loss": 0.3, "step": 96 }, { "epoch": 5.37, "grad_norm": 1.1346584558486938, "learning_rate": 0.0001625730994152047, "loss": 0.2441, "step": 102 }, { "epoch": 5.68, "grad_norm": 0.7600080370903015, "learning_rate": 0.00015906432748538012, "loss": 0.2277, "step": 108 }, { "epoch": 6.0, "grad_norm": 0.8896855711936951, "learning_rate": 0.00015555555555555556, "loss": 0.2157, "step": 114 }, { "epoch": 6.32, "grad_norm": 0.7400574684143066, "learning_rate": 0.00015204678362573098, "loss": 0.1671, "step": 120 }, { "epoch": 6.63, "grad_norm": 0.7485764622688293, "learning_rate": 0.00014853801169590643, "loss": 0.2079, "step": 126 }, { "epoch": 6.95, "grad_norm": 0.7054488658905029, "learning_rate": 0.00014502923976608188, "loss": 0.1424, "step": 132 }, { "epoch": 7.26, "grad_norm": 1.1137595176696777, "learning_rate": 0.00014152046783625732, "loss": 0.1406, "step": 138 }, { "epoch": 7.58, "grad_norm": 0.860434889793396, "learning_rate": 0.00013801169590643274, "loss": 0.1158, "step": 144 }, { "epoch": 7.89, "grad_norm": 0.7475857138633728, "learning_rate": 0.0001345029239766082, "loss": 0.11, "step": 150 }, { "epoch": 8.21, "grad_norm": 0.5861940979957581, "learning_rate": 0.00013099415204678364, "loss": 0.0968, "step": 156 }, { "epoch": 8.53, "grad_norm": 0.6981809139251709, "learning_rate": 0.00012748538011695908, "loss": 0.104, "step": 162 }, { "epoch": 8.84, "grad_norm": 0.7109177112579346, "learning_rate": 0.0001239766081871345, "loss": 0.0704, "step": 168 }, { "epoch": 9.16, "grad_norm": 0.4005749523639679, "learning_rate": 0.00012046783625730995, "loss": 0.0757, "step": 174 }, { "epoch": 9.47, "grad_norm": 0.719237744808197, "learning_rate": 0.00011695906432748539, "loss": 0.0697, "step": 180 }, { "epoch": 9.79, "grad_norm": 0.9757436513900757, "learning_rate": 0.00011345029239766083, "loss": 0.0614, "step": 186 }, { "epoch": 10.11, "grad_norm": 0.5613590478897095, "learning_rate": 0.00010994152046783625, "loss": 0.0496, "step": 192 }, { "epoch": 10.42, "grad_norm": 0.49901968240737915, "learning_rate": 0.00010643274853801171, "loss": 0.0476, "step": 198 }, { "epoch": 10.74, "grad_norm": 0.637506902217865, "learning_rate": 0.00010292397660818713, "loss": 0.0442, "step": 204 }, { "epoch": 11.05, "grad_norm": 0.3163486421108246, "learning_rate": 9.941520467836257e-05, "loss": 0.0327, "step": 210 }, { "epoch": 11.37, "grad_norm": 0.34687891602516174, "learning_rate": 9.590643274853801e-05, "loss": 0.0302, "step": 216 }, { "epoch": 11.68, "grad_norm": 0.505370020866394, "learning_rate": 9.239766081871345e-05, "loss": 0.0278, "step": 222 }, { "epoch": 12.0, "grad_norm": 1.4929355382919312, "learning_rate": 8.888888888888889e-05, "loss": 0.0429, "step": 228 }, { "epoch": 12.32, "grad_norm": 0.2789619266986847, "learning_rate": 8.538011695906433e-05, "loss": 0.0213, "step": 234 }, { "epoch": 12.63, "grad_norm": 0.41602373123168945, "learning_rate": 8.187134502923976e-05, "loss": 0.0188, "step": 240 }, { "epoch": 12.95, "grad_norm": 0.38267752528190613, "learning_rate": 7.836257309941521e-05, "loss": 0.027, "step": 246 }, { "epoch": 13.26, "grad_norm": 0.3227517008781433, "learning_rate": 7.485380116959064e-05, "loss": 0.0172, "step": 252 }, { "epoch": 13.58, "grad_norm": 0.5111700892448425, "learning_rate": 7.134502923976609e-05, "loss": 0.0189, "step": 258 }, { "epoch": 13.89, "grad_norm": 0.25930657982826233, "learning_rate": 6.783625730994152e-05, "loss": 0.0157, "step": 264 }, { "epoch": 14.21, "grad_norm": 0.4176621437072754, "learning_rate": 6.432748538011695e-05, "loss": 0.0166, "step": 270 }, { "epoch": 14.53, "grad_norm": 0.45017266273498535, "learning_rate": 6.0818713450292395e-05, "loss": 0.0148, "step": 276 }, { "epoch": 14.84, "grad_norm": 0.2901971638202667, "learning_rate": 5.7309941520467835e-05, "loss": 0.013, "step": 282 }, { "epoch": 15.16, "grad_norm": 0.1628808230161667, "learning_rate": 5.3801169590643275e-05, "loss": 0.0112, "step": 288 }, { "epoch": 15.47, "grad_norm": 0.2727632224559784, "learning_rate": 5.0292397660818715e-05, "loss": 0.011, "step": 294 }, { "epoch": 15.79, "grad_norm": 0.15591496229171753, "learning_rate": 4.678362573099415e-05, "loss": 0.0098, "step": 300 }, { "epoch": 16.11, "grad_norm": 0.15791279077529907, "learning_rate": 4.327485380116959e-05, "loss": 0.013, "step": 306 }, { "epoch": 16.42, "grad_norm": 0.07889483869075775, "learning_rate": 3.976608187134503e-05, "loss": 0.0116, "step": 312 }, { "epoch": 16.74, "grad_norm": 0.12881968915462494, "learning_rate": 3.625730994152047e-05, "loss": 0.0088, "step": 318 }, { "epoch": 17.05, "grad_norm": 0.1373162418603897, "learning_rate": 3.274853801169591e-05, "loss": 0.0078, "step": 324 }, { "epoch": 17.37, "grad_norm": 0.07368919253349304, "learning_rate": 2.9239766081871346e-05, "loss": 0.0075, "step": 330 }, { "epoch": 17.68, "grad_norm": 0.10508895665407181, "learning_rate": 2.5730994152046783e-05, "loss": 0.0085, "step": 336 }, { "epoch": 18.0, "grad_norm": 0.10840031504631042, "learning_rate": 2.2222222222222223e-05, "loss": 0.0079, "step": 342 }, { "epoch": 18.32, "grad_norm": 0.07944358885288239, "learning_rate": 1.871345029239766e-05, "loss": 0.0072, "step": 348 }, { "epoch": 18.63, "grad_norm": 0.10223820805549622, "learning_rate": 1.5204678362573099e-05, "loss": 0.008, "step": 354 }, { "epoch": 18.95, "grad_norm": 0.11620509624481201, "learning_rate": 1.1695906432748537e-05, "loss": 0.0078, "step": 360 }, { "epoch": 19.26, "grad_norm": 0.08788104355335236, "learning_rate": 8.187134502923977e-06, "loss": 0.0083, "step": 366 }, { "epoch": 19.58, "grad_norm": 0.1308579444885254, "learning_rate": 4.678362573099415e-06, "loss": 0.0071, "step": 372 }, { "epoch": 19.89, "grad_norm": 0.09444822371006012, "learning_rate": 1.1695906432748538e-06, "loss": 0.0079, "step": 378 } ], "logging_steps": 6, "max_steps": 380, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 3.007882350034944e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }