{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 380,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.32,
      "grad_norm": 0.18415243923664093,
      "learning_rate": 3.157894736842105e-05,
      "loss": 0.9158,
      "step": 6
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.17151106894016266,
      "learning_rate": 6.31578947368421e-05,
      "loss": 0.8454,
      "step": 12
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.253557026386261,
      "learning_rate": 9.473684210526316e-05,
      "loss": 0.9068,
      "step": 18
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.44574955105781555,
      "learning_rate": 0.0001263157894736842,
      "loss": 0.781,
      "step": 24
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.3264688551425934,
      "learning_rate": 0.00015789473684210527,
      "loss": 0.6487,
      "step": 30
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.33505332469940186,
      "learning_rate": 0.00018947368421052632,
      "loss": 0.6935,
      "step": 36
    },
    {
      "epoch": 2.21,
      "grad_norm": 0.3825117349624634,
      "learning_rate": 0.0001976608187134503,
      "loss": 0.606,
      "step": 42
    },
    {
      "epoch": 2.53,
      "grad_norm": 0.31299710273742676,
      "learning_rate": 0.00019415204678362573,
      "loss": 0.513,
      "step": 48
    },
    {
      "epoch": 2.84,
      "grad_norm": 0.5375415682792664,
      "learning_rate": 0.00019064327485380117,
      "loss": 0.5365,
      "step": 54
    },
    {
      "epoch": 3.16,
      "grad_norm": 0.4755648374557495,
      "learning_rate": 0.0001871345029239766,
      "loss": 0.3908,
      "step": 60
    },
    {
      "epoch": 3.47,
      "grad_norm": 0.9289490580558777,
      "learning_rate": 0.00018362573099415207,
      "loss": 0.3798,
      "step": 66
    },
    {
      "epoch": 3.79,
      "grad_norm": 0.4700639247894287,
      "learning_rate": 0.0001801169590643275,
      "loss": 0.4379,
      "step": 72
    },
    {
      "epoch": 4.11,
      "grad_norm": 0.5703785419464111,
      "learning_rate": 0.00017660818713450294,
      "loss": 0.3724,
      "step": 78
    },
    {
      "epoch": 4.42,
      "grad_norm": 0.6487219333648682,
      "learning_rate": 0.00017309941520467836,
      "loss": 0.3439,
      "step": 84
    },
    {
      "epoch": 4.74,
      "grad_norm": 0.599611759185791,
      "learning_rate": 0.0001695906432748538,
      "loss": 0.2867,
      "step": 90
    },
    {
      "epoch": 5.05,
      "grad_norm": 0.5314879417419434,
      "learning_rate": 0.00016608187134502925,
      "loss": 0.3,
      "step": 96
    },
    {
      "epoch": 5.37,
      "grad_norm": 1.1346584558486938,
      "learning_rate": 0.0001625730994152047,
      "loss": 0.2441,
      "step": 102
    },
    {
      "epoch": 5.68,
      "grad_norm": 0.7600080370903015,
      "learning_rate": 0.00015906432748538012,
      "loss": 0.2277,
      "step": 108
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.8896855711936951,
      "learning_rate": 0.00015555555555555556,
      "loss": 0.2157,
      "step": 114
    },
    {
      "epoch": 6.32,
      "grad_norm": 0.7400574684143066,
      "learning_rate": 0.00015204678362573098,
      "loss": 0.1671,
      "step": 120
    },
    {
      "epoch": 6.63,
      "grad_norm": 0.7485764622688293,
      "learning_rate": 0.00014853801169590643,
      "loss": 0.2079,
      "step": 126
    },
    {
      "epoch": 6.95,
      "grad_norm": 0.7054488658905029,
      "learning_rate": 0.00014502923976608188,
      "loss": 0.1424,
      "step": 132
    },
    {
      "epoch": 7.26,
      "grad_norm": 1.1137595176696777,
      "learning_rate": 0.00014152046783625732,
      "loss": 0.1406,
      "step": 138
    },
    {
      "epoch": 7.58,
      "grad_norm": 0.860434889793396,
      "learning_rate": 0.00013801169590643274,
      "loss": 0.1158,
      "step": 144
    },
    {
      "epoch": 7.89,
      "grad_norm": 0.7475857138633728,
      "learning_rate": 0.0001345029239766082,
      "loss": 0.11,
      "step": 150
    },
    {
      "epoch": 8.21,
      "grad_norm": 0.5861940979957581,
      "learning_rate": 0.00013099415204678364,
      "loss": 0.0968,
      "step": 156
    },
    {
      "epoch": 8.53,
      "grad_norm": 0.6981809139251709,
      "learning_rate": 0.00012748538011695908,
      "loss": 0.104,
      "step": 162
    },
    {
      "epoch": 8.84,
      "grad_norm": 0.7109177112579346,
      "learning_rate": 0.0001239766081871345,
      "loss": 0.0704,
      "step": 168
    },
    {
      "epoch": 9.16,
      "grad_norm": 0.4005749523639679,
      "learning_rate": 0.00012046783625730995,
      "loss": 0.0757,
      "step": 174
    },
    {
      "epoch": 9.47,
      "grad_norm": 0.719237744808197,
      "learning_rate": 0.00011695906432748539,
      "loss": 0.0697,
      "step": 180
    },
    {
      "epoch": 9.79,
      "grad_norm": 0.9757436513900757,
      "learning_rate": 0.00011345029239766083,
      "loss": 0.0614,
      "step": 186
    },
    {
      "epoch": 10.11,
      "grad_norm": 0.5613590478897095,
      "learning_rate": 0.00010994152046783625,
      "loss": 0.0496,
      "step": 192
    },
    {
      "epoch": 10.42,
      "grad_norm": 0.49901968240737915,
      "learning_rate": 0.00010643274853801171,
      "loss": 0.0476,
      "step": 198
    },
    {
      "epoch": 10.74,
      "grad_norm": 0.637506902217865,
      "learning_rate": 0.00010292397660818713,
      "loss": 0.0442,
      "step": 204
    },
    {
      "epoch": 11.05,
      "grad_norm": 0.3163486421108246,
      "learning_rate": 9.941520467836257e-05,
      "loss": 0.0327,
      "step": 210
    },
    {
      "epoch": 11.37,
      "grad_norm": 0.34687891602516174,
      "learning_rate": 9.590643274853801e-05,
      "loss": 0.0302,
      "step": 216
    },
    {
      "epoch": 11.68,
      "grad_norm": 0.505370020866394,
      "learning_rate": 9.239766081871345e-05,
      "loss": 0.0278,
      "step": 222
    },
    {
      "epoch": 12.0,
      "grad_norm": 1.4929355382919312,
      "learning_rate": 8.888888888888889e-05,
      "loss": 0.0429,
      "step": 228
    },
    {
      "epoch": 12.32,
      "grad_norm": 0.2789619266986847,
      "learning_rate": 8.538011695906433e-05,
      "loss": 0.0213,
      "step": 234
    },
    {
      "epoch": 12.63,
      "grad_norm": 0.41602373123168945,
      "learning_rate": 8.187134502923976e-05,
      "loss": 0.0188,
      "step": 240
    },
    {
      "epoch": 12.95,
      "grad_norm": 0.38267752528190613,
      "learning_rate": 7.836257309941521e-05,
      "loss": 0.027,
      "step": 246
    },
    {
      "epoch": 13.26,
      "grad_norm": 0.3227517008781433,
      "learning_rate": 7.485380116959064e-05,
      "loss": 0.0172,
      "step": 252
    },
    {
      "epoch": 13.58,
      "grad_norm": 0.5111700892448425,
      "learning_rate": 7.134502923976609e-05,
      "loss": 0.0189,
      "step": 258
    },
    {
      "epoch": 13.89,
      "grad_norm": 0.25930657982826233,
      "learning_rate": 6.783625730994152e-05,
      "loss": 0.0157,
      "step": 264
    },
    {
      "epoch": 14.21,
      "grad_norm": 0.4176621437072754,
      "learning_rate": 6.432748538011695e-05,
      "loss": 0.0166,
      "step": 270
    },
    {
      "epoch": 14.53,
      "grad_norm": 0.45017266273498535,
      "learning_rate": 6.0818713450292395e-05,
      "loss": 0.0148,
      "step": 276
    },
    {
      "epoch": 14.84,
      "grad_norm": 0.2901971638202667,
      "learning_rate": 5.7309941520467835e-05,
      "loss": 0.013,
      "step": 282
    },
    {
      "epoch": 15.16,
      "grad_norm": 0.1628808230161667,
      "learning_rate": 5.3801169590643275e-05,
      "loss": 0.0112,
      "step": 288
    },
    {
      "epoch": 15.47,
      "grad_norm": 0.2727632224559784,
      "learning_rate": 5.0292397660818715e-05,
      "loss": 0.011,
      "step": 294
    },
    {
      "epoch": 15.79,
      "grad_norm": 0.15591496229171753,
      "learning_rate": 4.678362573099415e-05,
      "loss": 0.0098,
      "step": 300
    },
    {
      "epoch": 16.11,
      "grad_norm": 0.15791279077529907,
      "learning_rate": 4.327485380116959e-05,
      "loss": 0.013,
      "step": 306
    },
    {
      "epoch": 16.42,
      "grad_norm": 0.07889483869075775,
      "learning_rate": 3.976608187134503e-05,
      "loss": 0.0116,
      "step": 312
    },
    {
      "epoch": 16.74,
      "grad_norm": 0.12881968915462494,
      "learning_rate": 3.625730994152047e-05,
      "loss": 0.0088,
      "step": 318
    },
    {
      "epoch": 17.05,
      "grad_norm": 0.1373162418603897,
      "learning_rate": 3.274853801169591e-05,
      "loss": 0.0078,
      "step": 324
    },
    {
      "epoch": 17.37,
      "grad_norm": 0.07368919253349304,
      "learning_rate": 2.9239766081871346e-05,
      "loss": 0.0075,
      "step": 330
    },
    {
      "epoch": 17.68,
      "grad_norm": 0.10508895665407181,
      "learning_rate": 2.5730994152046783e-05,
      "loss": 0.0085,
      "step": 336
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.10840031504631042,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 0.0079,
      "step": 342
    },
    {
      "epoch": 18.32,
      "grad_norm": 0.07944358885288239,
      "learning_rate": 1.871345029239766e-05,
      "loss": 0.0072,
      "step": 348
    },
    {
      "epoch": 18.63,
      "grad_norm": 0.10223820805549622,
      "learning_rate": 1.5204678362573099e-05,
      "loss": 0.008,
      "step": 354
    },
    {
      "epoch": 18.95,
      "grad_norm": 0.11620509624481201,
      "learning_rate": 1.1695906432748537e-05,
      "loss": 0.0078,
      "step": 360
    },
    {
      "epoch": 19.26,
      "grad_norm": 0.08788104355335236,
      "learning_rate": 8.187134502923977e-06,
      "loss": 0.0083,
      "step": 366
    },
    {
      "epoch": 19.58,
      "grad_norm": 0.1308579444885254,
      "learning_rate": 4.678362573099415e-06,
      "loss": 0.0071,
      "step": 372
    },
    {
      "epoch": 19.89,
      "grad_norm": 0.09444822371006012,
      "learning_rate": 1.1695906432748538e-06,
      "loss": 0.0079,
      "step": 378
    }
  ],
  "logging_steps": 6,
  "max_steps": 380,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "total_flos": 3.007882350034944e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}