{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 451,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011098779134295227,
      "grad_norm": 1.020947813987732,
      "learning_rate": 1.0619469026548673e-06,
      "loss": 1.2866,
      "step": 5
    },
    {
      "epoch": 0.022197558268590455,
      "grad_norm": 1.0182945728302002,
      "learning_rate": 2.389380530973451e-06,
      "loss": 1.242,
      "step": 10
    },
    {
      "epoch": 0.033296337402885685,
      "grad_norm": 0.6542662382125854,
      "learning_rate": 3.7168141592920353e-06,
      "loss": 1.2928,
      "step": 15
    },
    {
      "epoch": 0.04439511653718091,
      "grad_norm": 0.588236391544342,
      "learning_rate": 5.04424778761062e-06,
      "loss": 1.242,
      "step": 20
    },
    {
      "epoch": 0.05549389567147614,
      "grad_norm": 0.579189121723175,
      "learning_rate": 6.371681415929204e-06,
      "loss": 1.222,
      "step": 25
    },
    {
      "epoch": 0.06659267480577137,
      "grad_norm": 0.516274094581604,
      "learning_rate": 7.699115044247788e-06,
      "loss": 1.1735,
      "step": 30
    },
    {
      "epoch": 0.07769145394006659,
      "grad_norm": 0.5116409659385681,
      "learning_rate": 9.026548672566373e-06,
      "loss": 1.2008,
      "step": 35
    },
    {
      "epoch": 0.08879023307436182,
      "grad_norm": 0.566781222820282,
      "learning_rate": 1.0353982300884955e-05,
      "loss": 1.206,
      "step": 40
    },
    {
      "epoch": 0.09988901220865705,
      "grad_norm": 0.48452886939048767,
      "learning_rate": 1.168141592920354e-05,
      "loss": 1.177,
      "step": 45
    },
    {
      "epoch": 0.11098779134295228,
      "grad_norm": 0.4358895719051361,
      "learning_rate": 1.3008849557522125e-05,
      "loss": 1.1648,
      "step": 50
    },
    {
      "epoch": 0.1220865704772475,
      "grad_norm": 0.5665825605392456,
      "learning_rate": 1.433628318584071e-05,
      "loss": 1.1316,
      "step": 55
    },
    {
      "epoch": 0.13318534961154274,
      "grad_norm": 0.481657475233078,
      "learning_rate": 1.5663716814159295e-05,
      "loss": 1.1461,
      "step": 60
    },
    {
      "epoch": 0.14428412874583796,
      "grad_norm": 0.5076817274093628,
      "learning_rate": 1.6991150442477876e-05,
      "loss": 1.1245,
      "step": 65
    },
    {
      "epoch": 0.15538290788013318,
      "grad_norm": 0.4932589530944824,
      "learning_rate": 1.8318584070796458e-05,
      "loss": 1.1608,
      "step": 70
    },
    {
      "epoch": 0.16648168701442842,
      "grad_norm": 0.488295316696167,
      "learning_rate": 1.9646017699115046e-05,
      "loss": 1.1016,
      "step": 75
    },
    {
      "epoch": 0.17758046614872364,
      "grad_norm": 0.48246464133262634,
      "learning_rate": 2.0973451327433628e-05,
      "loss": 1.1259,
      "step": 80
    },
    {
      "epoch": 0.18867924528301888,
      "grad_norm": 0.48559442162513733,
      "learning_rate": 2.2300884955752213e-05,
      "loss": 1.0816,
      "step": 85
    },
    {
      "epoch": 0.1997780244173141,
      "grad_norm": 0.4469965398311615,
      "learning_rate": 2.3628318584070798e-05,
      "loss": 1.1392,
      "step": 90
    },
    {
      "epoch": 0.21087680355160932,
      "grad_norm": 0.5929376482963562,
      "learning_rate": 2.495575221238938e-05,
      "loss": 1.1227,
      "step": 95
    },
    {
      "epoch": 0.22197558268590456,
      "grad_norm": 0.42551663517951965,
      "learning_rate": 2.6283185840707968e-05,
      "loss": 1.14,
      "step": 100
    },
    {
      "epoch": 0.23307436182019978,
      "grad_norm": 0.5455872416496277,
      "learning_rate": 2.761061946902655e-05,
      "loss": 1.0756,
      "step": 105
    },
    {
      "epoch": 0.244173140954495,
      "grad_norm": 0.5717443823814392,
      "learning_rate": 2.893805309734513e-05,
      "loss": 1.0195,
      "step": 110
    },
    {
      "epoch": 0.25527192008879024,
      "grad_norm": 0.5069417953491211,
      "learning_rate": 2.999998386674501e-05,
      "loss": 1.0218,
      "step": 115
    },
    {
      "epoch": 0.2663706992230855,
      "grad_norm": 0.6419563293457031,
      "learning_rate": 2.999941920646437e-05,
      "loss": 1.0718,
      "step": 120
    },
    {
      "epoch": 0.27746947835738067,
      "grad_norm": 0.5803301930427551,
      "learning_rate": 2.9998047918138238e-05,
      "loss": 1.0622,
      "step": 125
    },
    {
      "epoch": 0.2885682574916759,
      "grad_norm": 0.5580577850341797,
      "learning_rate": 2.9995870075510795e-05,
      "loss": 1.0556,
      "step": 130
    },
    {
      "epoch": 0.29966703662597116,
      "grad_norm": 0.518118679523468,
      "learning_rate": 2.9992885795700492e-05,
      "loss": 1.0543,
      "step": 135
    },
    {
      "epoch": 0.31076581576026635,
      "grad_norm": 0.7111470699310303,
      "learning_rate": 2.9989095239193793e-05,
      "loss": 1.0242,
      "step": 140
    },
    {
      "epoch": 0.3218645948945616,
      "grad_norm": 0.5507873296737671,
      "learning_rate": 2.998449860983655e-05,
      "loss": 0.9969,
      "step": 145
    },
    {
      "epoch": 0.33296337402885684,
      "grad_norm": 0.5861643552780151,
      "learning_rate": 2.9979096154823008e-05,
      "loss": 0.9685,
      "step": 150
    },
    {
      "epoch": 0.34406215316315203,
      "grad_norm": 0.5630763173103333,
      "learning_rate": 2.997288816468252e-05,
      "loss": 1.0296,
      "step": 155
    },
    {
      "epoch": 0.3551609322974473,
      "grad_norm": 0.6153825521469116,
      "learning_rate": 2.9965874973263953e-05,
      "loss": 0.9819,
      "step": 160
    },
    {
      "epoch": 0.3662597114317425,
      "grad_norm": 0.6237099170684814,
      "learning_rate": 2.99580569577177e-05,
      "loss": 1.0431,
      "step": 165
    },
    {
      "epoch": 0.37735849056603776,
      "grad_norm": 0.6736854314804077,
      "learning_rate": 2.9949434538475414e-05,
      "loss": 0.9211,
      "step": 170
    },
    {
      "epoch": 0.38845726970033295,
      "grad_norm": 0.7440330386161804,
      "learning_rate": 2.9940008179227398e-05,
      "loss": 0.9114,
      "step": 175
    },
    {
      "epoch": 0.3995560488346282,
      "grad_norm": 0.7021297812461853,
      "learning_rate": 2.9929778386897653e-05,
      "loss": 0.959,
      "step": 180
    },
    {
      "epoch": 0.41065482796892344,
      "grad_norm": 0.6943163871765137,
      "learning_rate": 2.9918745711616647e-05,
      "loss": 0.9401,
      "step": 185
    },
    {
      "epoch": 0.42175360710321863,
      "grad_norm": 0.5902981758117676,
      "learning_rate": 2.99069107466917e-05,
      "loss": 0.9254,
      "step": 190
    },
    {
      "epoch": 0.4328523862375139,
      "grad_norm": 0.7625190615653992,
      "learning_rate": 2.9894274128575104e-05,
      "loss": 1.0178,
      "step": 195
    },
    {
      "epoch": 0.4439511653718091,
      "grad_norm": 0.9426499009132385,
      "learning_rate": 2.9880836536829867e-05,
      "loss": 0.9321,
      "step": 200
    },
    {
      "epoch": 0.4550499445061043,
      "grad_norm": 0.6531440019607544,
      "learning_rate": 2.9866598694093204e-05,
      "loss": 0.9645,
      "step": 205
    },
    {
      "epoch": 0.46614872364039955,
      "grad_norm": 0.7278143167495728,
      "learning_rate": 2.985156136603764e-05,
      "loss": 0.8816,
      "step": 210
    },
    {
      "epoch": 0.4772475027746948,
      "grad_norm": 0.6720770001411438,
      "learning_rate": 2.983572536132985e-05,
      "loss": 0.9271,
      "step": 215
    },
    {
      "epoch": 0.48834628190899,
      "grad_norm": 0.7253017425537109,
      "learning_rate": 2.9819091531587184e-05,
      "loss": 0.92,
      "step": 220
    },
    {
      "epoch": 0.49944506104328523,
      "grad_norm": 0.7579580545425415,
      "learning_rate": 2.9801660771331855e-05,
      "loss": 0.9163,
      "step": 225
    },
    {
      "epoch": 0.5105438401775805,
      "grad_norm": 0.7624631524085999,
      "learning_rate": 2.9783434017942814e-05,
      "loss": 0.8707,
      "step": 230
    },
    {
      "epoch": 0.5216426193118757,
      "grad_norm": 0.8809816837310791,
      "learning_rate": 2.9764412251605386e-05,
      "loss": 0.8942,
      "step": 235
    },
    {
      "epoch": 0.532741398446171,
      "grad_norm": 0.8497355580329895,
      "learning_rate": 2.974459649525853e-05,
      "loss": 0.9032,
      "step": 240
    },
    {
      "epoch": 0.5438401775804661,
      "grad_norm": 0.771935224533081,
      "learning_rate": 2.9723987814539818e-05,
      "loss": 0.8595,
      "step": 245
    },
    {
      "epoch": 0.5549389567147613,
      "grad_norm": 0.7510399222373962,
      "learning_rate": 2.9702587317728157e-05,
      "loss": 0.8882,
      "step": 250
    },
    {
      "epoch": 0.5660377358490566,
      "grad_norm": 0.7542237043380737,
      "learning_rate": 2.968039615568417e-05,
      "loss": 0.8684,
      "step": 255
    },
    {
      "epoch": 0.5771365149833518,
      "grad_norm": 0.8170939087867737,
      "learning_rate": 2.965741552178831e-05,
      "loss": 0.869,
      "step": 260
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.7694569826126099,
      "learning_rate": 2.9633646651876678e-05,
      "loss": 0.8339,
      "step": 265
    },
    {
      "epoch": 0.5993340732519423,
      "grad_norm": 0.8420746922492981,
      "learning_rate": 2.960909082417457e-05,
      "loss": 0.8849,
      "step": 270
    },
    {
      "epoch": 0.6104328523862376,
      "grad_norm": 0.90952068567276,
      "learning_rate": 2.9583749359227744e-05,
      "loss": 0.8696,
      "step": 275
    },
    {
      "epoch": 0.6215316315205327,
      "grad_norm": 0.9462668299674988,
      "learning_rate": 2.9557623619831385e-05,
      "loss": 0.8323,
      "step": 280
    },
    {
      "epoch": 0.632630410654828,
      "grad_norm": 0.8263661861419678,
      "learning_rate": 2.9530715010956828e-05,
      "loss": 0.8225,
      "step": 285
    },
    {
      "epoch": 0.6437291897891232,
      "grad_norm": 0.9636655449867249,
      "learning_rate": 2.950302497967602e-05,
      "loss": 0.8116,
      "step": 290
    },
    {
      "epoch": 0.6548279689234184,
      "grad_norm": 0.792352020740509,
      "learning_rate": 2.9474555015083665e-05,
      "loss": 0.8045,
      "step": 295
    },
    {
      "epoch": 0.6659267480577137,
      "grad_norm": 0.8013424873352051,
      "learning_rate": 2.944530664821717e-05,
      "loss": 0.7529,
      "step": 300
    },
    {
      "epoch": 0.6770255271920089,
      "grad_norm": 0.8466354608535767,
      "learning_rate": 2.9415281451974305e-05,
      "loss": 0.7782,
      "step": 305
    },
    {
      "epoch": 0.6881243063263041,
      "grad_norm": 0.8911815285682678,
      "learning_rate": 2.938448104102862e-05,
      "loss": 0.7746,
      "step": 310
    },
    {
      "epoch": 0.6992230854605993,
      "grad_norm": 0.9244343638420105,
      "learning_rate": 2.935290707174259e-05,
      "loss": 0.7962,
      "step": 315
    },
    {
      "epoch": 0.7103218645948945,
      "grad_norm": 0.9099295735359192,
      "learning_rate": 2.9320561242078572e-05,
      "loss": 0.7738,
      "step": 320
    },
    {
      "epoch": 0.7214206437291898,
      "grad_norm": 0.8415182828903198,
      "learning_rate": 2.928744529150748e-05,
      "loss": 0.7366,
      "step": 325
    },
    {
      "epoch": 0.732519422863485,
      "grad_norm": 0.9733411073684692,
      "learning_rate": 2.9253561000915226e-05,
      "loss": 0.7519,
      "step": 330
    },
    {
      "epoch": 0.7436182019977803,
      "grad_norm": 0.8663511872291565,
      "learning_rate": 2.9218910192506977e-05,
      "loss": 0.7463,
      "step": 335
    },
    {
      "epoch": 0.7547169811320755,
      "grad_norm": 0.8752774000167847,
      "learning_rate": 2.918349472970914e-05,
      "loss": 0.7216,
      "step": 340
    },
    {
      "epoch": 0.7658157602663707,
      "grad_norm": 0.898760974407196,
      "learning_rate": 2.9147316517069164e-05,
      "loss": 0.805,
      "step": 345
    },
    {
      "epoch": 0.7769145394006659,
      "grad_norm": 0.8114638328552246,
      "learning_rate": 2.9110377500153103e-05,
      "loss": 0.7402,
      "step": 350
    },
    {
      "epoch": 0.7880133185349611,
      "grad_norm": 0.9255645871162415,
      "learning_rate": 2.9072679665441016e-05,
      "loss": 0.7046,
      "step": 355
    },
    {
      "epoch": 0.7991120976692564,
      "grad_norm": 0.8243072628974915,
      "learning_rate": 2.9034225040220105e-05,
      "loss": 0.7653,
      "step": 360
    },
    {
      "epoch": 0.8102108768035516,
      "grad_norm": 0.8807098269462585,
      "learning_rate": 2.899501569247573e-05,
      "loss": 0.7149,
      "step": 365
    },
    {
      "epoch": 0.8213096559378469,
      "grad_norm": 0.8475015163421631,
      "learning_rate": 2.895505373078016e-05,
      "loss": 0.6763,
      "step": 370
    },
    {
      "epoch": 0.832408435072142,
      "grad_norm": 0.8343256711959839,
      "learning_rate": 2.891434130417922e-05,
      "loss": 0.7348,
      "step": 375
    },
    {
      "epoch": 0.8435072142064373,
      "grad_norm": 0.9187164306640625,
      "learning_rate": 2.8872880602076675e-05,
      "loss": 0.7555,
      "step": 380
    },
    {
      "epoch": 0.8546059933407325,
      "grad_norm": 0.955761194229126,
      "learning_rate": 2.8830673854116543e-05,
      "loss": 0.6992,
      "step": 385
    },
    {
      "epoch": 0.8657047724750278,
      "grad_norm": 0.9569207429885864,
      "learning_rate": 2.878772333006314e-05,
      "loss": 0.6657,
      "step": 390
    },
    {
      "epoch": 0.876803551609323,
      "grad_norm": 0.9878470301628113,
      "learning_rate": 2.874403133967906e-05,
      "loss": 0.6982,
      "step": 395
    },
    {
      "epoch": 0.8879023307436182,
      "grad_norm": 1.106117844581604,
      "learning_rate": 2.869960023260092e-05,
      "loss": 0.6987,
      "step": 400
    },
    {
      "epoch": 0.8990011098779135,
      "grad_norm": 0.9092835187911987,
      "learning_rate": 2.865443239821305e-05,
      "loss": 0.6365,
      "step": 405
    },
    {
      "epoch": 0.9100998890122086,
      "grad_norm": 1.0136357545852661,
      "learning_rate": 2.860853026551896e-05,
      "loss": 0.681,
      "step": 410
    },
    {
      "epoch": 0.9211986681465039,
      "grad_norm": 0.9235714673995972,
      "learning_rate": 2.8561896303010737e-05,
      "loss": 0.6689,
      "step": 415
    },
    {
      "epoch": 0.9322974472807991,
      "grad_norm": 0.8986316919326782,
      "learning_rate": 2.8514533018536286e-05,
      "loss": 0.6377,
      "step": 420
    },
    {
      "epoch": 0.9433962264150944,
      "grad_norm": 1.089402675628662,
      "learning_rate": 2.8466442959164467e-05,
      "loss": 0.652,
      "step": 425
    },
    {
      "epoch": 0.9544950055493896,
      "grad_norm": 0.9313281774520874,
      "learning_rate": 2.841762871104812e-05,
      "loss": 0.6775,
      "step": 430
    },
    {
      "epoch": 0.9655937846836848,
      "grad_norm": 0.9396001100540161,
      "learning_rate": 2.8368092899284992e-05,
      "loss": 0.6707,
      "step": 435
    },
    {
      "epoch": 0.97669256381798,
      "grad_norm": 0.8905186057090759,
      "learning_rate": 2.8317838187776565e-05,
      "loss": 0.6125,
      "step": 440
    },
    {
      "epoch": 0.9877913429522752,
      "grad_norm": 0.9526168704032898,
      "learning_rate": 2.826686727908479e-05,
      "loss": 0.6681,
      "step": 445
    },
    {
      "epoch": 0.9988901220865705,
      "grad_norm": 1.046419620513916,
      "learning_rate": 2.8215182914286768e-05,
      "loss": 0.6802,
      "step": 450
    }
  ],
  "logging_steps": 5,
  "max_steps": 2255,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.499013219183493e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}