{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 451, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011098779134295227, "grad_norm": 1.020947813987732, "learning_rate": 1.0619469026548673e-06, "loss": 1.2866, "step": 5 }, { "epoch": 0.022197558268590455, "grad_norm": 1.0182945728302002, "learning_rate": 2.389380530973451e-06, "loss": 1.242, "step": 10 }, { "epoch": 0.033296337402885685, "grad_norm": 0.6542662382125854, "learning_rate": 3.7168141592920353e-06, "loss": 1.2928, "step": 15 }, { "epoch": 0.04439511653718091, "grad_norm": 0.588236391544342, "learning_rate": 5.04424778761062e-06, "loss": 1.242, "step": 20 }, { "epoch": 0.05549389567147614, "grad_norm": 0.579189121723175, "learning_rate": 6.371681415929204e-06, "loss": 1.222, "step": 25 }, { "epoch": 0.06659267480577137, "grad_norm": 0.516274094581604, "learning_rate": 7.699115044247788e-06, "loss": 1.1735, "step": 30 }, { "epoch": 0.07769145394006659, "grad_norm": 0.5116409659385681, "learning_rate": 9.026548672566373e-06, "loss": 1.2008, "step": 35 }, { "epoch": 0.08879023307436182, "grad_norm": 0.566781222820282, "learning_rate": 1.0353982300884955e-05, "loss": 1.206, "step": 40 }, { "epoch": 0.09988901220865705, "grad_norm": 0.48452886939048767, "learning_rate": 1.168141592920354e-05, "loss": 1.177, "step": 45 }, { "epoch": 0.11098779134295228, "grad_norm": 0.4358895719051361, "learning_rate": 1.3008849557522125e-05, "loss": 1.1648, "step": 50 }, { "epoch": 0.1220865704772475, "grad_norm": 0.5665825605392456, "learning_rate": 1.433628318584071e-05, "loss": 1.1316, "step": 55 }, { "epoch": 0.13318534961154274, "grad_norm": 0.481657475233078, "learning_rate": 1.5663716814159295e-05, "loss": 1.1461, "step": 60 }, { "epoch": 0.14428412874583796, "grad_norm": 0.5076817274093628, "learning_rate": 1.6991150442477876e-05, "loss": 1.1245, "step": 65 }, { "epoch": 0.15538290788013318, "grad_norm": 0.4932589530944824, "learning_rate": 1.8318584070796458e-05, "loss": 1.1608, "step": 70 }, { "epoch": 0.16648168701442842, "grad_norm": 0.488295316696167, "learning_rate": 1.9646017699115046e-05, "loss": 1.1016, "step": 75 }, { "epoch": 0.17758046614872364, "grad_norm": 0.48246464133262634, "learning_rate": 2.0973451327433628e-05, "loss": 1.1259, "step": 80 }, { "epoch": 0.18867924528301888, "grad_norm": 0.48559442162513733, "learning_rate": 2.2300884955752213e-05, "loss": 1.0816, "step": 85 }, { "epoch": 0.1997780244173141, "grad_norm": 0.4469965398311615, "learning_rate": 2.3628318584070798e-05, "loss": 1.1392, "step": 90 }, { "epoch": 0.21087680355160932, "grad_norm": 0.5929376482963562, "learning_rate": 2.495575221238938e-05, "loss": 1.1227, "step": 95 }, { "epoch": 0.22197558268590456, "grad_norm": 0.42551663517951965, "learning_rate": 2.6283185840707968e-05, "loss": 1.14, "step": 100 }, { "epoch": 0.23307436182019978, "grad_norm": 0.5455872416496277, "learning_rate": 2.761061946902655e-05, "loss": 1.0756, "step": 105 }, { "epoch": 0.244173140954495, "grad_norm": 0.5717443823814392, "learning_rate": 2.893805309734513e-05, "loss": 1.0195, "step": 110 }, { "epoch": 0.25527192008879024, "grad_norm": 0.5069417953491211, "learning_rate": 2.999998386674501e-05, "loss": 1.0218, "step": 115 }, { "epoch": 0.2663706992230855, "grad_norm": 0.6419563293457031, "learning_rate": 2.999941920646437e-05, "loss": 1.0718, "step": 120 }, { "epoch": 0.27746947835738067, "grad_norm": 0.5803301930427551, "learning_rate": 2.9998047918138238e-05, "loss": 1.0622, "step": 125 }, { "epoch": 0.2885682574916759, "grad_norm": 0.5580577850341797, "learning_rate": 2.9995870075510795e-05, "loss": 1.0556, "step": 130 }, { "epoch": 0.29966703662597116, "grad_norm": 0.518118679523468, "learning_rate": 2.9992885795700492e-05, "loss": 1.0543, "step": 135 }, { "epoch": 0.31076581576026635, "grad_norm": 0.7111470699310303, "learning_rate": 2.9989095239193793e-05, "loss": 1.0242, "step": 140 }, { "epoch": 0.3218645948945616, "grad_norm": 0.5507873296737671, "learning_rate": 2.998449860983655e-05, "loss": 0.9969, "step": 145 }, { "epoch": 0.33296337402885684, "grad_norm": 0.5861643552780151, "learning_rate": 2.9979096154823008e-05, "loss": 0.9685, "step": 150 }, { "epoch": 0.34406215316315203, "grad_norm": 0.5630763173103333, "learning_rate": 2.997288816468252e-05, "loss": 1.0296, "step": 155 }, { "epoch": 0.3551609322974473, "grad_norm": 0.6153825521469116, "learning_rate": 2.9965874973263953e-05, "loss": 0.9819, "step": 160 }, { "epoch": 0.3662597114317425, "grad_norm": 0.6237099170684814, "learning_rate": 2.99580569577177e-05, "loss": 1.0431, "step": 165 }, { "epoch": 0.37735849056603776, "grad_norm": 0.6736854314804077, "learning_rate": 2.9949434538475414e-05, "loss": 0.9211, "step": 170 }, { "epoch": 0.38845726970033295, "grad_norm": 0.7440330386161804, "learning_rate": 2.9940008179227398e-05, "loss": 0.9114, "step": 175 }, { "epoch": 0.3995560488346282, "grad_norm": 0.7021297812461853, "learning_rate": 2.9929778386897653e-05, "loss": 0.959, "step": 180 }, { "epoch": 0.41065482796892344, "grad_norm": 0.6943163871765137, "learning_rate": 2.9918745711616647e-05, "loss": 0.9401, "step": 185 }, { "epoch": 0.42175360710321863, "grad_norm": 0.5902981758117676, "learning_rate": 2.99069107466917e-05, "loss": 0.9254, "step": 190 }, { "epoch": 0.4328523862375139, "grad_norm": 0.7625190615653992, "learning_rate": 2.9894274128575104e-05, "loss": 1.0178, "step": 195 }, { "epoch": 0.4439511653718091, "grad_norm": 0.9426499009132385, "learning_rate": 2.9880836536829867e-05, "loss": 0.9321, "step": 200 }, { "epoch": 0.4550499445061043, "grad_norm": 0.6531440019607544, "learning_rate": 2.9866598694093204e-05, "loss": 0.9645, "step": 205 }, { "epoch": 0.46614872364039955, "grad_norm": 0.7278143167495728, "learning_rate": 2.985156136603764e-05, "loss": 0.8816, "step": 210 }, { "epoch": 0.4772475027746948, "grad_norm": 0.6720770001411438, "learning_rate": 2.983572536132985e-05, "loss": 0.9271, "step": 215 }, { "epoch": 0.48834628190899, "grad_norm": 0.7253017425537109, "learning_rate": 2.9819091531587184e-05, "loss": 0.92, "step": 220 }, { "epoch": 0.49944506104328523, "grad_norm": 0.7579580545425415, "learning_rate": 2.9801660771331855e-05, "loss": 0.9163, "step": 225 }, { "epoch": 0.5105438401775805, "grad_norm": 0.7624631524085999, "learning_rate": 2.9783434017942814e-05, "loss": 0.8707, "step": 230 }, { "epoch": 0.5216426193118757, "grad_norm": 0.8809816837310791, "learning_rate": 2.9764412251605386e-05, "loss": 0.8942, "step": 235 }, { "epoch": 0.532741398446171, "grad_norm": 0.8497355580329895, "learning_rate": 2.974459649525853e-05, "loss": 0.9032, "step": 240 }, { "epoch": 0.5438401775804661, "grad_norm": 0.771935224533081, "learning_rate": 2.9723987814539818e-05, "loss": 0.8595, "step": 245 }, { "epoch": 0.5549389567147613, "grad_norm": 0.7510399222373962, "learning_rate": 2.9702587317728157e-05, "loss": 0.8882, "step": 250 }, { "epoch": 0.5660377358490566, "grad_norm": 0.7542237043380737, "learning_rate": 2.968039615568417e-05, "loss": 0.8684, "step": 255 }, { "epoch": 0.5771365149833518, "grad_norm": 0.8170939087867737, "learning_rate": 2.965741552178831e-05, "loss": 0.869, "step": 260 }, { "epoch": 0.5882352941176471, "grad_norm": 0.7694569826126099, "learning_rate": 2.9633646651876678e-05, "loss": 0.8339, "step": 265 }, { "epoch": 0.5993340732519423, "grad_norm": 0.8420746922492981, "learning_rate": 2.960909082417457e-05, "loss": 0.8849, "step": 270 }, { "epoch": 0.6104328523862376, "grad_norm": 0.90952068567276, "learning_rate": 2.9583749359227744e-05, "loss": 0.8696, "step": 275 }, { "epoch": 0.6215316315205327, "grad_norm": 0.9462668299674988, "learning_rate": 2.9557623619831385e-05, "loss": 0.8323, "step": 280 }, { "epoch": 0.632630410654828, "grad_norm": 0.8263661861419678, "learning_rate": 2.9530715010956828e-05, "loss": 0.8225, "step": 285 }, { "epoch": 0.6437291897891232, "grad_norm": 0.9636655449867249, "learning_rate": 2.950302497967602e-05, "loss": 0.8116, "step": 290 }, { "epoch": 0.6548279689234184, "grad_norm": 0.792352020740509, "learning_rate": 2.9474555015083665e-05, "loss": 0.8045, "step": 295 }, { "epoch": 0.6659267480577137, "grad_norm": 0.8013424873352051, "learning_rate": 2.944530664821717e-05, "loss": 0.7529, "step": 300 }, { "epoch": 0.6770255271920089, "grad_norm": 0.8466354608535767, "learning_rate": 2.9415281451974305e-05, "loss": 0.7782, "step": 305 }, { "epoch": 0.6881243063263041, "grad_norm": 0.8911815285682678, "learning_rate": 2.938448104102862e-05, "loss": 0.7746, "step": 310 }, { "epoch": 0.6992230854605993, "grad_norm": 0.9244343638420105, "learning_rate": 2.935290707174259e-05, "loss": 0.7962, "step": 315 }, { "epoch": 0.7103218645948945, "grad_norm": 0.9099295735359192, "learning_rate": 2.9320561242078572e-05, "loss": 0.7738, "step": 320 }, { "epoch": 0.7214206437291898, "grad_norm": 0.8415182828903198, "learning_rate": 2.928744529150748e-05, "loss": 0.7366, "step": 325 }, { "epoch": 0.732519422863485, "grad_norm": 0.9733411073684692, "learning_rate": 2.9253561000915226e-05, "loss": 0.7519, "step": 330 }, { "epoch": 0.7436182019977803, "grad_norm": 0.8663511872291565, "learning_rate": 2.9218910192506977e-05, "loss": 0.7463, "step": 335 }, { "epoch": 0.7547169811320755, "grad_norm": 0.8752774000167847, "learning_rate": 2.918349472970914e-05, "loss": 0.7216, "step": 340 }, { "epoch": 0.7658157602663707, "grad_norm": 0.898760974407196, "learning_rate": 2.9147316517069164e-05, "loss": 0.805, "step": 345 }, { "epoch": 0.7769145394006659, "grad_norm": 0.8114638328552246, "learning_rate": 2.9110377500153103e-05, "loss": 0.7402, "step": 350 }, { "epoch": 0.7880133185349611, "grad_norm": 0.9255645871162415, "learning_rate": 2.9072679665441016e-05, "loss": 0.7046, "step": 355 }, { "epoch": 0.7991120976692564, "grad_norm": 0.8243072628974915, "learning_rate": 2.9034225040220105e-05, "loss": 0.7653, "step": 360 }, { "epoch": 0.8102108768035516, "grad_norm": 0.8807098269462585, "learning_rate": 2.899501569247573e-05, "loss": 0.7149, "step": 365 }, { "epoch": 0.8213096559378469, "grad_norm": 0.8475015163421631, "learning_rate": 2.895505373078016e-05, "loss": 0.6763, "step": 370 }, { "epoch": 0.832408435072142, "grad_norm": 0.8343256711959839, "learning_rate": 2.891434130417922e-05, "loss": 0.7348, "step": 375 }, { "epoch": 0.8435072142064373, "grad_norm": 0.9187164306640625, "learning_rate": 2.8872880602076675e-05, "loss": 0.7555, "step": 380 }, { "epoch": 0.8546059933407325, "grad_norm": 0.955761194229126, "learning_rate": 2.8830673854116543e-05, "loss": 0.6992, "step": 385 }, { "epoch": 0.8657047724750278, "grad_norm": 0.9569207429885864, "learning_rate": 2.878772333006314e-05, "loss": 0.6657, "step": 390 }, { "epoch": 0.876803551609323, "grad_norm": 0.9878470301628113, "learning_rate": 2.874403133967906e-05, "loss": 0.6982, "step": 395 }, { "epoch": 0.8879023307436182, "grad_norm": 1.106117844581604, "learning_rate": 2.869960023260092e-05, "loss": 0.6987, "step": 400 }, { "epoch": 0.8990011098779135, "grad_norm": 0.9092835187911987, "learning_rate": 2.865443239821305e-05, "loss": 0.6365, "step": 405 }, { "epoch": 0.9100998890122086, "grad_norm": 1.0136357545852661, "learning_rate": 2.860853026551896e-05, "loss": 0.681, "step": 410 }, { "epoch": 0.9211986681465039, "grad_norm": 0.9235714673995972, "learning_rate": 2.8561896303010737e-05, "loss": 0.6689, "step": 415 }, { "epoch": 0.9322974472807991, "grad_norm": 0.8986316919326782, "learning_rate": 2.8514533018536286e-05, "loss": 0.6377, "step": 420 }, { "epoch": 0.9433962264150944, "grad_norm": 1.089402675628662, "learning_rate": 2.8466442959164467e-05, "loss": 0.652, "step": 425 }, { "epoch": 0.9544950055493896, "grad_norm": 0.9313281774520874, "learning_rate": 2.841762871104812e-05, "loss": 0.6775, "step": 430 }, { "epoch": 0.9655937846836848, "grad_norm": 0.9396001100540161, "learning_rate": 2.8368092899284992e-05, "loss": 0.6707, "step": 435 }, { "epoch": 0.97669256381798, "grad_norm": 0.8905186057090759, "learning_rate": 2.8317838187776565e-05, "loss": 0.6125, "step": 440 }, { "epoch": 0.9877913429522752, "grad_norm": 0.9526168704032898, "learning_rate": 2.826686727908479e-05, "loss": 0.6681, "step": 445 }, { "epoch": 0.9988901220865705, "grad_norm": 1.046419620513916, "learning_rate": 2.8215182914286768e-05, "loss": 0.6802, "step": 450 } ], "logging_steps": 5, "max_steps": 2255, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.499013219183493e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }