| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.06337938902268982, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0006337938902268982, |
| "grad_norm": 19.0, |
| "learning_rate": 3.4285714285714284e-05, |
| "loss": 2.3561, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0012675877804537963, |
| "grad_norm": 12.3125, |
| "learning_rate": 7.714285714285713e-05, |
| "loss": 2.0686, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0019013816706806947, |
| "grad_norm": 6.78125, |
| "learning_rate": 0.00011999999999999999, |
| "loss": 1.412, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0025351755609075927, |
| "grad_norm": 2.296875, |
| "learning_rate": 0.00016285714285714284, |
| "loss": 0.8966, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.003168969451134491, |
| "grad_norm": 1.8671875, |
| "learning_rate": 0.0002057142857142857, |
| "loss": 0.7659, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0038027633413613895, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.00024857142857142857, |
| "loss": 0.7462, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0044365572315882874, |
| "grad_norm": 1.640625, |
| "learning_rate": 0.0002914285714285714, |
| "loss": 0.7137, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.005070351121815185, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.0002999999985714883, |
| "loss": 0.7019, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005704145012042084, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.0002999999927681599, |
| "loss": 0.682, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.006337938902268982, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00029999998250073286, |
| "loss": 0.6785, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.00697173279249588, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00029999996776920767, |
| "loss": 0.6718, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.007605526682722779, |
| "grad_norm": 2.640625, |
| "learning_rate": 0.0002999999485735849, |
| "loss": 0.6704, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.008239320572949677, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.0002999999249138654, |
| "loss": 0.6495, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.008873114463176575, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00029999989679004995, |
| "loss": 0.652, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.009506908353403473, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.00029999986420213977, |
| "loss": 0.6583, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.01014070224363037, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.0002999998271501361, |
| "loss": 0.6612, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.01077449613385727, |
| "grad_norm": 1.625, |
| "learning_rate": 0.0002999997856340405, |
| "loss": 0.6436, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.011408290024084168, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00029999973965385445, |
| "loss": 0.6532, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.012042083914311066, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00029999968920958, |
| "loss": 0.6426, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.012675877804537964, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.000299999634301219, |
| "loss": 0.6507, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.013309671694764862, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0002999995749287736, |
| "loss": 0.6404, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.01394346558499176, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00029999951109224623, |
| "loss": 0.6312, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.014577259475218658, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.0002999994427916394, |
| "loss": 0.6227, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.015211053365445558, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.0002999993700269559, |
| "loss": 0.6377, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.015844847255672454, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.0002999992927981985, |
| "loss": 0.6294, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.016478641145899354, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.0002999992111053703, |
| "loss": 0.6333, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.01711243503612625, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.0002999991249484746, |
| "loss": 0.6157, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.01774622892635315, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.0002999990343275147, |
| "loss": 0.6145, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.01838002281658005, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00029999893924249427, |
| "loss": 0.6091, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.019013816706806946, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00029999883969341717, |
| "loss": 0.6149, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.019647610597033845, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.0002999987356802872, |
| "loss": 0.6171, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.02028140448726074, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00029999862720310857, |
| "loss": 0.6021, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.02091519837748764, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.0002999985142618855, |
| "loss": 0.6303, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.02154899226771454, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.0002999983968566226, |
| "loss": 0.6139, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.022182786157941437, |
| "grad_norm": 1.9765625, |
| "learning_rate": 0.0002999982749873244, |
| "loss": 0.6173, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.022816580048168337, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00029999814865399585, |
| "loss": 0.6074, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.023450373938395233, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00029999801785664194, |
| "loss": 0.6046, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.024084167828622133, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00029999788259526774, |
| "loss": 0.5923, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.02471796171884903, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00029999774286987876, |
| "loss": 0.6149, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.02535175560907593, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.0002999975986804805, |
| "loss": 0.5969, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.02598554949930283, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0002999974500270787, |
| "loss": 0.6083, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.026619343389529725, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00029999729690967917, |
| "loss": 0.6029, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.027253137279756624, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.0002999971393282881, |
| "loss": 0.6064, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.02788693116998352, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0002999969772829116, |
| "loss": 0.6102, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.02852072506021042, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00029999681077355633, |
| "loss": 0.5899, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.029154518950437316, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00029999663980022866, |
| "loss": 0.595, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.029788312840664216, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00029999646436293555, |
| "loss": 0.5848, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.030422106730891116, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00029999628446168383, |
| "loss": 0.6003, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.031055900621118012, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0002999961000964807, |
| "loss": 0.5875, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.03168969451134491, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00029999591126733346, |
| "loss": 0.5769, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.03232348840157181, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00029999571797424966, |
| "loss": 0.5928, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.03295728229179871, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.0002999955202172369, |
| "loss": 0.5767, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.03359107618202561, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.000299995317996303, |
| "loss": 0.5763, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.0342248700722525, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.0002999951113114561, |
| "loss": 0.5852, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.0348586639624794, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0002999949001627043, |
| "loss": 0.5965, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0354924578527063, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00029999468455005606, |
| "loss": 0.5828, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.0361262517429332, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00029999446447351985, |
| "loss": 0.5918, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.0367600456331601, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00029999423993310446, |
| "loss": 0.5853, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.03739383952338699, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00029999401092881876, |
| "loss": 0.5706, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.03802763341361389, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.0002999937774606719, |
| "loss": 0.571, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03866142730384079, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00029999353952867307, |
| "loss": 0.5731, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.03929522119406769, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.0002999932971328317, |
| "loss": 0.5806, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.03992901508429459, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0002999930502731575, |
| "loss": 0.5751, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.04056280897452148, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0002999927989496603, |
| "loss": 0.5777, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.04119660286474838, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.0002999925431623499, |
| "loss": 0.5856, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.04183039675497528, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00029999228291123655, |
| "loss": 0.5754, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.04246419064520218, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.0002999920181963306, |
| "loss": 0.5684, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.04309798453542908, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.0002999917490176425, |
| "loss": 0.5665, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.043731778425655975, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00029999147537518295, |
| "loss": 0.5713, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.044365572315882874, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.0002999911972689628, |
| "loss": 0.5675, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.044999366206109774, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.0002999909146989931, |
| "loss": 0.575, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.045633160096336674, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0002999906276652851, |
| "loss": 0.5721, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.046266953986563567, |
| "grad_norm": 1.8828125, |
| "learning_rate": 0.00029999033616785015, |
| "loss": 0.5653, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.046900747876790466, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0002999900402066998, |
| "loss": 0.5543, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.047534541767017366, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.0002999897397818458, |
| "loss": 0.5643, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.048168335657244266, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.0002999894348933001, |
| "loss": 0.574, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.048802129547471165, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.0002999891255410748, |
| "loss": 0.5581, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.04943592343769806, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0002999888117251821, |
| "loss": 0.558, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.05006971732792496, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00029998849344563456, |
| "loss": 0.5614, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.05070351121815186, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00029998817070244475, |
| "loss": 0.5692, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.05133730510837876, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00029998784349562544, |
| "loss": 0.5552, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.05197109899860566, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.0002999875118251897, |
| "loss": 0.5549, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.05260489288883255, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00029998717569115063, |
| "loss": 0.5574, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.05323868677905945, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.0002999868350935216, |
| "loss": 0.5587, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.05387248066928635, |
| "grad_norm": 1.9765625, |
| "learning_rate": 0.00029998649003231613, |
| "loss": 0.5611, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.05450627455951325, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.0002999861405075479, |
| "loss": 0.5567, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.05514006844974014, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0002999857865192308, |
| "loss": 0.5525, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.05577386233996704, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00029998542806737875, |
| "loss": 0.5595, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.05640765623019394, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00029998506515200614, |
| "loss": 0.5534, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.05704145012042084, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.0002999846977731273, |
| "loss": 0.548, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05767524401064774, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00029998432593075677, |
| "loss": 0.5534, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.05830903790087463, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00029998394962490945, |
| "loss": 0.5593, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.05894283179110153, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.0002999835688556001, |
| "loss": 0.5591, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.05957662568132843, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0002999831836228439, |
| "loss": 0.5674, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.06021041957155533, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00029998279392665615, |
| "loss": 0.534, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.06084421346178223, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.0002999823997670523, |
| "loss": 0.5403, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.061478007352009124, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00029998200114404803, |
| "loss": 0.542, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.062111801242236024, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00029998159805765906, |
| "loss": 0.5432, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.06274559513246292, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00029998119050790145, |
| "loss": 0.5462, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.06337938902268982, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00029998077849479135, |
| "loss": 0.5406, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.06337938902268982, |
| "eval_loss": 0.5448389053344727, |
| "eval_runtime": 2.5375, |
| "eval_samples_per_second": 78.818, |
| "eval_steps_per_second": 78.818, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 78890, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2537940672512e+16, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |