{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07399459839431721, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014798919678863443, "grad_norm": 3.318990468978882, "learning_rate": 3.6e-05, "loss": 0.8225, "step": 10 }, { "epoch": 0.0029597839357726886, "grad_norm": 1.644492745399475, "learning_rate": 7.6e-05, "loss": 0.2495, "step": 20 }, { "epoch": 0.004439675903659033, "grad_norm": 2.137453556060791, "learning_rate": 9.998250366089848e-05, "loss": 0.2076, "step": 30 }, { "epoch": 0.005919567871545377, "grad_norm": 1.6013213396072388, "learning_rate": 9.97858104436822e-05, "loss": 0.1753, "step": 40 }, { "epoch": 0.007399459839431722, "grad_norm": 2.4884767532348633, "learning_rate": 9.937141654477528e-05, "loss": 0.1661, "step": 50 }, { "epoch": 0.008879351807318065, "grad_norm": 1.5080770254135132, "learning_rate": 9.87411340032603e-05, "loss": 0.1414, "step": 60 }, { "epoch": 0.01035924377520441, "grad_norm": 1.3873870372772217, "learning_rate": 9.789771888432375e-05, "loss": 0.1409, "step": 70 }, { "epoch": 0.011839135743090754, "grad_norm": 1.1532950401306152, "learning_rate": 9.684485922768422e-05, "loss": 0.1169, "step": 80 }, { "epoch": 0.013319027710977099, "grad_norm": 1.088762640953064, "learning_rate": 9.558715892073323e-05, "loss": 0.1043, "step": 90 }, { "epoch": 0.014798919678863444, "grad_norm": 2.323737621307373, "learning_rate": 9.413011756690685e-05, "loss": 0.116, "step": 100 }, { "epoch": 0.016278811646749786, "grad_norm": 1.2114522457122803, "learning_rate": 9.248010643731935e-05, "loss": 0.1146, "step": 110 }, { "epoch": 0.01775870361463613, "grad_norm": 1.2140398025512695, "learning_rate": 9.064434061081562e-05, "loss": 0.1248, "step": 120 }, { "epoch": 0.019238595582522475, "grad_norm": 1.2787519693374634, "learning_rate": 8.863084742426719e-05, "loss": 0.0848, "step": 130 }, { "epoch": 0.02071848755040882, "grad_norm": 0.8212881684303284, "learning_rate": 8.644843137107059e-05, "loss": 0.0965, "step": 140 }, { "epoch": 0.022198379518295164, "grad_norm": 1.0979036092758179, "learning_rate": 8.410663560133784e-05, "loss": 0.0922, "step": 150 }, { "epoch": 0.02367827148618151, "grad_norm": 0.9978598356246948, "learning_rate": 8.161570019212921e-05, "loss": 0.0916, "step": 160 }, { "epoch": 0.025158163454067854, "grad_norm": 0.8716151714324951, "learning_rate": 7.898651737020166e-05, "loss": 0.067, "step": 170 }, { "epoch": 0.026638055421954198, "grad_norm": 1.0746519565582275, "learning_rate": 7.623058388307269e-05, "loss": 0.0802, "step": 180 }, { "epoch": 0.028117947389840543, "grad_norm": 1.3861006498336792, "learning_rate": 7.335995072666848e-05, "loss": 0.0907, "step": 190 }, { "epoch": 0.029597839357726887, "grad_norm": 1.093328833580017, "learning_rate": 7.038717044938519e-05, "loss": 0.0657, "step": 200 }, { "epoch": 0.03107773132561323, "grad_norm": 0.8141024708747864, "learning_rate": 6.732524226298841e-05, "loss": 0.0613, "step": 210 }, { "epoch": 0.03255762329349957, "grad_norm": 0.9468361139297485, "learning_rate": 6.418755520036775e-05, "loss": 0.0829, "step": 220 }, { "epoch": 0.03403751526138592, "grad_norm": 0.6965128183364868, "learning_rate": 6.0987829568702656e-05, "loss": 0.0796, "step": 230 }, { "epoch": 0.03551740722927226, "grad_norm": 1.127314567565918, "learning_rate": 5.7740056954050084e-05, "loss": 0.0798, "step": 240 }, { "epoch": 0.036997299197158606, "grad_norm": 1.4155848026275635, "learning_rate": 5.445843903969854e-05, "loss": 0.06, "step": 250 }, { "epoch": 0.03847719116504495, "grad_norm": 0.7054800987243652, "learning_rate": 5.1157325505820694e-05, "loss": 0.052, "step": 260 }, { "epoch": 0.039957083132931295, "grad_norm": 0.7660062909126282, "learning_rate": 4.785115128197298e-05, "loss": 0.059, "step": 270 }, { "epoch": 0.04143697510081764, "grad_norm": 0.7886701822280884, "learning_rate": 4.4554373426821374e-05, "loss": 0.0536, "step": 280 }, { "epoch": 0.042916867068703984, "grad_norm": 1.4434950351715088, "learning_rate": 4.1281407911102425e-05, "loss": 0.0628, "step": 290 }, { "epoch": 0.04439675903659033, "grad_norm": 0.9001641869544983, "learning_rate": 3.8046566580251e-05, "loss": 0.0536, "step": 300 }, { "epoch": 0.04587665100447667, "grad_norm": 0.7539929151535034, "learning_rate": 3.4863994572341843e-05, "loss": 0.0673, "step": 310 }, { "epoch": 0.04735654297236302, "grad_norm": 0.6108214855194092, "learning_rate": 3.1747608464999725e-05, "loss": 0.0358, "step": 320 }, { "epoch": 0.04883643494024936, "grad_norm": 0.4837505519390106, "learning_rate": 2.8711035421746367e-05, "loss": 0.041, "step": 330 }, { "epoch": 0.05031632690813571, "grad_norm": 0.8078829050064087, "learning_rate": 2.5767553603881767e-05, "loss": 0.0518, "step": 340 }, { "epoch": 0.05179621887602205, "grad_norm": 1.5803087949752808, "learning_rate": 2.29300341084631e-05, "loss": 0.0486, "step": 350 }, { "epoch": 0.053276110843908396, "grad_norm": 0.3863455355167389, "learning_rate": 2.0210884686272368e-05, "loss": 0.039, "step": 360 }, { "epoch": 0.05475600281179474, "grad_norm": 0.7085531949996948, "learning_rate": 1.7621995485879062e-05, "loss": 0.0389, "step": 370 }, { "epoch": 0.056235894779681085, "grad_norm": 0.7141103744506836, "learning_rate": 1.517468706104589e-05, "loss": 0.0425, "step": 380 }, { "epoch": 0.05771578674756743, "grad_norm": 0.6287118196487427, "learning_rate": 1.2879660868827508e-05, "loss": 0.0457, "step": 390 }, { "epoch": 0.059195678715453774, "grad_norm": 0.821704089641571, "learning_rate": 1.0746952474821614e-05, "loss": 0.0436, "step": 400 }, { "epoch": 0.06067557068334012, "grad_norm": 0.6595329642295837, "learning_rate": 8.785887670194138e-06, "loss": 0.0436, "step": 410 }, { "epoch": 0.06215546265122646, "grad_norm": 0.5961970090866089, "learning_rate": 7.005041692367154e-06, "loss": 0.0309, "step": 420 }, { "epoch": 0.06363535461911281, "grad_norm": 0.578628659248352, "learning_rate": 5.412201727687644e-06, "loss": 0.0402, "step": 430 }, { "epoch": 0.06511524658699915, "grad_norm": 0.48573747277259827, "learning_rate": 4.01433286004283e-06, "loss": 0.0338, "step": 440 }, { "epoch": 0.0665951385548855, "grad_norm": 0.54139244556427, "learning_rate": 2.817547614320615e-06, "loss": 0.0269, "step": 450 }, { "epoch": 0.06807503052277183, "grad_norm": 0.6104289293289185, "learning_rate": 1.8270792278934302e-06, "loss": 0.0328, "step": 460 }, { "epoch": 0.06955492249065819, "grad_norm": 0.40234851837158203, "learning_rate": 1.0472587670027678e-06, "loss": 0.0341, "step": 470 }, { "epoch": 0.07103481445854452, "grad_norm": 0.3963511288166046, "learning_rate": 4.814961881085045e-07, "loss": 0.0347, "step": 480 }, { "epoch": 0.07251470642643087, "grad_norm": 0.36650583148002625, "learning_rate": 1.3226542701689215e-07, "loss": 0.0401, "step": 490 }, { "epoch": 0.07399459839431721, "grad_norm": 0.48519328236579895, "learning_rate": 1.0935809887702154e-09, "loss": 0.0292, "step": 500 }, { "epoch": 0.07399459839431721, "step": 500, "total_flos": 0.0, "train_loss": 0.09187581622600556, "train_runtime": 831.2312, "train_samples_per_second": 4.812, "train_steps_per_second": 0.602 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }