{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06337938902268982, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006337938902268982, "grad_norm": 19.0, "learning_rate": 3.4285714285714284e-05, "loss": 2.3561, "step": 5 }, { "epoch": 0.0012675877804537963, "grad_norm": 12.3125, "learning_rate": 7.714285714285713e-05, "loss": 2.0686, "step": 10 }, { "epoch": 0.0019013816706806947, "grad_norm": 6.78125, "learning_rate": 0.00011999999999999999, "loss": 1.412, "step": 15 }, { "epoch": 0.0025351755609075927, "grad_norm": 2.296875, "learning_rate": 0.00016285714285714284, "loss": 0.8966, "step": 20 }, { "epoch": 0.003168969451134491, "grad_norm": 1.8671875, "learning_rate": 0.0002057142857142857, "loss": 0.7659, "step": 25 }, { "epoch": 0.0038027633413613895, "grad_norm": 2.046875, "learning_rate": 0.00024857142857142857, "loss": 0.7462, "step": 30 }, { "epoch": 0.0044365572315882874, "grad_norm": 1.640625, "learning_rate": 0.0002914285714285714, "loss": 0.7137, "step": 35 }, { "epoch": 0.005070351121815185, "grad_norm": 1.4765625, "learning_rate": 0.0002999999985714883, "loss": 0.7019, "step": 40 }, { "epoch": 0.005704145012042084, "grad_norm": 2.234375, "learning_rate": 0.0002999999927681599, "loss": 0.682, "step": 45 }, { "epoch": 0.006337938902268982, "grad_norm": 1.8984375, "learning_rate": 0.00029999998250073286, "loss": 0.6785, "step": 50 }, { "epoch": 0.00697173279249588, "grad_norm": 1.84375, "learning_rate": 0.00029999996776920767, "loss": 0.6718, "step": 55 }, { "epoch": 0.007605526682722779, "grad_norm": 2.640625, "learning_rate": 0.0002999999485735849, "loss": 0.6704, "step": 60 }, { "epoch": 0.008239320572949677, "grad_norm": 1.4765625, "learning_rate": 0.0002999999249138654, "loss": 0.6495, "step": 65 }, { "epoch": 0.008873114463176575, "grad_norm": 1.6328125, "learning_rate": 0.00029999989679004995, "loss": 0.652, "step": 70 }, { "epoch": 0.009506908353403473, "grad_norm": 1.6171875, "learning_rate": 0.00029999986420213977, "loss": 0.6583, "step": 75 }, { "epoch": 0.01014070224363037, "grad_norm": 1.671875, "learning_rate": 0.0002999998271501361, "loss": 0.6612, "step": 80 }, { "epoch": 0.01077449613385727, "grad_norm": 1.625, "learning_rate": 0.0002999997856340405, "loss": 0.6436, "step": 85 }, { "epoch": 0.011408290024084168, "grad_norm": 1.390625, "learning_rate": 0.00029999973965385445, "loss": 0.6532, "step": 90 }, { "epoch": 0.012042083914311066, "grad_norm": 1.4609375, "learning_rate": 0.00029999968920958, "loss": 0.6426, "step": 95 }, { "epoch": 0.012675877804537964, "grad_norm": 1.921875, "learning_rate": 0.000299999634301219, "loss": 0.6507, "step": 100 }, { "epoch": 0.013309671694764862, "grad_norm": 1.265625, "learning_rate": 0.0002999995749287736, "loss": 0.6404, "step": 105 }, { "epoch": 0.01394346558499176, "grad_norm": 1.390625, "learning_rate": 0.00029999951109224623, "loss": 0.6312, "step": 110 }, { "epoch": 0.014577259475218658, "grad_norm": 1.453125, "learning_rate": 0.0002999994427916394, "loss": 0.6227, "step": 115 }, { "epoch": 0.015211053365445558, "grad_norm": 1.6015625, "learning_rate": 0.0002999993700269559, "loss": 0.6377, "step": 120 }, { "epoch": 0.015844847255672454, "grad_norm": 1.4296875, "learning_rate": 0.0002999992927981985, "loss": 0.6294, "step": 125 }, { "epoch": 0.016478641145899354, "grad_norm": 1.484375, "learning_rate": 0.0002999992111053703, "loss": 0.6333, "step": 130 }, { "epoch": 0.01711243503612625, "grad_norm": 1.484375, "learning_rate": 0.0002999991249484746, "loss": 0.6157, "step": 135 }, { "epoch": 0.01774622892635315, "grad_norm": 2.203125, "learning_rate": 0.0002999990343275147, "loss": 0.6145, "step": 140 }, { "epoch": 0.01838002281658005, "grad_norm": 1.2734375, "learning_rate": 0.00029999893924249427, "loss": 0.6091, "step": 145 }, { "epoch": 0.019013816706806946, "grad_norm": 1.296875, "learning_rate": 0.00029999883969341717, "loss": 0.6149, "step": 150 }, { "epoch": 0.019647610597033845, "grad_norm": 1.6328125, "learning_rate": 0.0002999987356802872, "loss": 0.6171, "step": 155 }, { "epoch": 0.02028140448726074, "grad_norm": 1.3984375, "learning_rate": 0.00029999862720310857, "loss": 0.6021, "step": 160 }, { "epoch": 0.02091519837748764, "grad_norm": 1.546875, "learning_rate": 0.0002999985142618855, "loss": 0.6303, "step": 165 }, { "epoch": 0.02154899226771454, "grad_norm": 1.6953125, "learning_rate": 0.0002999983968566226, "loss": 0.6139, "step": 170 }, { "epoch": 0.022182786157941437, "grad_norm": 1.9765625, "learning_rate": 0.0002999982749873244, "loss": 0.6173, "step": 175 }, { "epoch": 0.022816580048168337, "grad_norm": 1.3515625, "learning_rate": 0.00029999814865399585, "loss": 0.6074, "step": 180 }, { "epoch": 0.023450373938395233, "grad_norm": 1.3828125, "learning_rate": 0.00029999801785664194, "loss": 0.6046, "step": 185 }, { "epoch": 0.024084167828622133, "grad_norm": 1.28125, "learning_rate": 0.00029999788259526774, "loss": 0.5923, "step": 190 }, { "epoch": 0.02471796171884903, "grad_norm": 1.265625, "learning_rate": 0.00029999774286987876, "loss": 0.6149, "step": 195 }, { "epoch": 0.02535175560907593, "grad_norm": 1.8984375, "learning_rate": 0.0002999975986804805, "loss": 0.5969, "step": 200 }, { "epoch": 0.02598554949930283, "grad_norm": 1.4609375, "learning_rate": 0.0002999974500270787, "loss": 0.6083, "step": 205 }, { "epoch": 0.026619343389529725, "grad_norm": 1.4609375, "learning_rate": 0.00029999729690967917, "loss": 0.6029, "step": 210 }, { "epoch": 0.027253137279756624, "grad_norm": 1.515625, "learning_rate": 0.0002999971393282881, "loss": 0.6064, "step": 215 }, { "epoch": 0.02788693116998352, "grad_norm": 1.2734375, "learning_rate": 0.0002999969772829116, "loss": 0.6102, "step": 220 }, { "epoch": 0.02852072506021042, "grad_norm": 1.3984375, "learning_rate": 0.00029999681077355633, "loss": 0.5899, "step": 225 }, { "epoch": 0.029154518950437316, "grad_norm": 1.4765625, "learning_rate": 0.00029999663980022866, "loss": 0.595, "step": 230 }, { "epoch": 0.029788312840664216, "grad_norm": 1.5078125, "learning_rate": 0.00029999646436293555, "loss": 0.5848, "step": 235 }, { "epoch": 0.030422106730891116, "grad_norm": 1.46875, "learning_rate": 0.00029999628446168383, "loss": 0.6003, "step": 240 }, { "epoch": 0.031055900621118012, "grad_norm": 1.34375, "learning_rate": 0.0002999961000964807, "loss": 0.5875, "step": 245 }, { "epoch": 0.03168969451134491, "grad_norm": 1.5078125, "learning_rate": 0.00029999591126733346, "loss": 0.5769, "step": 250 }, { "epoch": 0.03232348840157181, "grad_norm": 1.328125, "learning_rate": 0.00029999571797424966, "loss": 0.5928, "step": 255 }, { "epoch": 0.03295728229179871, "grad_norm": 1.578125, "learning_rate": 0.0002999955202172369, "loss": 0.5767, "step": 260 }, { "epoch": 0.03359107618202561, "grad_norm": 1.515625, "learning_rate": 0.000299995317996303, "loss": 0.5763, "step": 265 }, { "epoch": 0.0342248700722525, "grad_norm": 1.5078125, "learning_rate": 0.0002999951113114561, "loss": 0.5852, "step": 270 }, { "epoch": 0.0348586639624794, "grad_norm": 1.8125, "learning_rate": 0.0002999949001627043, "loss": 0.5965, "step": 275 }, { "epoch": 0.0354924578527063, "grad_norm": 1.5703125, "learning_rate": 0.00029999468455005606, "loss": 0.5828, "step": 280 }, { "epoch": 0.0361262517429332, "grad_norm": 1.5, "learning_rate": 0.00029999446447351985, "loss": 0.5918, "step": 285 }, { "epoch": 0.0367600456331601, "grad_norm": 1.3828125, "learning_rate": 0.00029999423993310446, "loss": 0.5853, "step": 290 }, { "epoch": 0.03739383952338699, "grad_norm": 1.703125, "learning_rate": 0.00029999401092881876, "loss": 0.5706, "step": 295 }, { "epoch": 0.03802763341361389, "grad_norm": 1.5859375, "learning_rate": 0.0002999937774606719, "loss": 0.571, "step": 300 }, { "epoch": 0.03866142730384079, "grad_norm": 1.578125, "learning_rate": 0.00029999353952867307, "loss": 0.5731, "step": 305 }, { "epoch": 0.03929522119406769, "grad_norm": 1.578125, "learning_rate": 0.0002999932971328317, "loss": 0.5806, "step": 310 }, { "epoch": 0.03992901508429459, "grad_norm": 1.2734375, "learning_rate": 0.0002999930502731575, "loss": 0.5751, "step": 315 }, { "epoch": 0.04056280897452148, "grad_norm": 1.265625, "learning_rate": 0.0002999927989496603, "loss": 0.5777, "step": 320 }, { "epoch": 0.04119660286474838, "grad_norm": 1.5390625, "learning_rate": 0.0002999925431623499, "loss": 0.5856, "step": 325 }, { "epoch": 0.04183039675497528, "grad_norm": 1.46875, "learning_rate": 0.00029999228291123655, "loss": 0.5754, "step": 330 }, { "epoch": 0.04246419064520218, "grad_norm": 1.4921875, "learning_rate": 0.0002999920181963306, "loss": 0.5684, "step": 335 }, { "epoch": 0.04309798453542908, "grad_norm": 1.5703125, "learning_rate": 0.0002999917490176425, "loss": 0.5665, "step": 340 }, { "epoch": 0.043731778425655975, "grad_norm": 1.328125, "learning_rate": 0.00029999147537518295, "loss": 0.5713, "step": 345 }, { "epoch": 0.044365572315882874, "grad_norm": 1.6875, "learning_rate": 0.0002999911972689628, "loss": 0.5675, "step": 350 }, { "epoch": 0.044999366206109774, "grad_norm": 1.59375, "learning_rate": 0.0002999909146989931, "loss": 0.575, "step": 355 }, { "epoch": 0.045633160096336674, "grad_norm": 1.25, "learning_rate": 0.0002999906276652851, "loss": 0.5721, "step": 360 }, { "epoch": 0.046266953986563567, "grad_norm": 1.8828125, "learning_rate": 0.00029999033616785015, "loss": 0.5653, "step": 365 }, { "epoch": 0.046900747876790466, "grad_norm": 1.8125, "learning_rate": 0.0002999900402066998, "loss": 0.5543, "step": 370 }, { "epoch": 0.047534541767017366, "grad_norm": 1.53125, "learning_rate": 0.0002999897397818458, "loss": 0.5643, "step": 375 }, { "epoch": 0.048168335657244266, "grad_norm": 1.78125, "learning_rate": 0.0002999894348933001, "loss": 0.574, "step": 380 }, { "epoch": 0.048802129547471165, "grad_norm": 1.46875, "learning_rate": 0.0002999891255410748, "loss": 0.5581, "step": 385 }, { "epoch": 0.04943592343769806, "grad_norm": 1.4609375, "learning_rate": 0.0002999888117251821, "loss": 0.558, "step": 390 }, { "epoch": 0.05006971732792496, "grad_norm": 1.3671875, "learning_rate": 0.00029998849344563456, "loss": 0.5614, "step": 395 }, { "epoch": 0.05070351121815186, "grad_norm": 1.4609375, "learning_rate": 0.00029998817070244475, "loss": 0.5692, "step": 400 }, { "epoch": 0.05133730510837876, "grad_norm": 1.359375, "learning_rate": 0.00029998784349562544, "loss": 0.5552, "step": 405 }, { "epoch": 0.05197109899860566, "grad_norm": 1.59375, "learning_rate": 0.0002999875118251897, "loss": 0.5549, "step": 410 }, { "epoch": 0.05260489288883255, "grad_norm": 1.484375, "learning_rate": 0.00029998717569115063, "loss": 0.5574, "step": 415 }, { "epoch": 0.05323868677905945, "grad_norm": 1.4765625, "learning_rate": 0.0002999868350935216, "loss": 0.5587, "step": 420 }, { "epoch": 0.05387248066928635, "grad_norm": 1.9765625, "learning_rate": 0.00029998649003231613, "loss": 0.5611, "step": 425 }, { "epoch": 0.05450627455951325, "grad_norm": 1.21875, "learning_rate": 0.0002999861405075479, "loss": 0.5567, "step": 430 }, { "epoch": 0.05514006844974014, "grad_norm": 1.15625, "learning_rate": 0.0002999857865192308, "loss": 0.5525, "step": 435 }, { "epoch": 0.05577386233996704, "grad_norm": 1.8046875, "learning_rate": 0.00029998542806737875, "loss": 0.5595, "step": 440 }, { "epoch": 0.05640765623019394, "grad_norm": 1.4609375, "learning_rate": 0.00029998506515200614, "loss": 0.5534, "step": 445 }, { "epoch": 0.05704145012042084, "grad_norm": 1.453125, "learning_rate": 0.0002999846977731273, "loss": 0.548, "step": 450 }, { "epoch": 0.05767524401064774, "grad_norm": 1.546875, "learning_rate": 0.00029998432593075677, "loss": 0.5534, "step": 455 }, { "epoch": 0.05830903790087463, "grad_norm": 1.390625, "learning_rate": 0.00029998394962490945, "loss": 0.5593, "step": 460 }, { "epoch": 0.05894283179110153, "grad_norm": 1.3359375, "learning_rate": 0.0002999835688556001, "loss": 0.5591, "step": 465 }, { "epoch": 0.05957662568132843, "grad_norm": 1.4609375, "learning_rate": 0.0002999831836228439, "loss": 0.5674, "step": 470 }, { "epoch": 0.06021041957155533, "grad_norm": 1.2734375, "learning_rate": 0.00029998279392665615, "loss": 0.534, "step": 475 }, { "epoch": 0.06084421346178223, "grad_norm": 1.453125, "learning_rate": 0.0002999823997670523, "loss": 0.5403, "step": 480 }, { "epoch": 0.061478007352009124, "grad_norm": 1.3515625, "learning_rate": 0.00029998200114404803, "loss": 0.542, "step": 485 }, { "epoch": 0.062111801242236024, "grad_norm": 1.5, "learning_rate": 0.00029998159805765906, "loss": 0.5432, "step": 490 }, { "epoch": 0.06274559513246292, "grad_norm": 1.4453125, "learning_rate": 0.00029998119050790145, "loss": 0.5462, "step": 495 }, { "epoch": 0.06337938902268982, "grad_norm": 1.4765625, "learning_rate": 0.00029998077849479135, "loss": 0.5406, "step": 500 }, { "epoch": 0.06337938902268982, "eval_loss": 0.5448389053344727, "eval_runtime": 2.5375, "eval_samples_per_second": 78.818, "eval_steps_per_second": 78.818, "step": 500 } ], "logging_steps": 5, "max_steps": 78890, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2537940672512e+16, "train_batch_size": 48, "trial_name": null, "trial_params": null }