| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 543, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0055248618784530384, |
| "grad_norm": 7.4375, |
| "learning_rate": 3.6363636363636366e-07, |
| "loss": 2.4042, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.027624309392265192, |
| "grad_norm": 7.59375, |
| "learning_rate": 1.8181818181818183e-06, |
| "loss": 2.4209, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.055248618784530384, |
| "grad_norm": 7.1875, |
| "learning_rate": 3.6363636363636366e-06, |
| "loss": 2.4119, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.08287292817679558, |
| "grad_norm": 7.03125, |
| "learning_rate": 5.4545454545454545e-06, |
| "loss": 2.3657, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.11049723756906077, |
| "grad_norm": 7.28125, |
| "learning_rate": 7.272727272727273e-06, |
| "loss": 2.3123, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.13812154696132597, |
| "grad_norm": 5.65625, |
| "learning_rate": 9.090909090909091e-06, |
| "loss": 2.1773, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.16574585635359115, |
| "grad_norm": 3.84375, |
| "learning_rate": 1.0909090909090909e-05, |
| "loss": 2.0066, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.19337016574585636, |
| "grad_norm": 3.0, |
| "learning_rate": 1.2727272727272728e-05, |
| "loss": 1.9028, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.22099447513812154, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.4545454545454546e-05, |
| "loss": 1.7204, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.24861878453038674, |
| "grad_norm": 2.109375, |
| "learning_rate": 1.6363636363636366e-05, |
| "loss": 1.5881, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.27624309392265195, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.8181818181818182e-05, |
| "loss": 1.4278, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.30386740331491713, |
| "grad_norm": 1.71875, |
| "learning_rate": 2e-05, |
| "loss": 1.2964, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.3314917127071823, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.9994819965926346e-05, |
| "loss": 1.1979, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.35911602209944754, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.997928523025598e-05, |
| "loss": 1.1192, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.3867403314917127, |
| "grad_norm": 0.76171875, |
| "learning_rate": 1.9953411887080917e-05, |
| "loss": 1.0823, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.4143646408839779, |
| "grad_norm": 0.70703125, |
| "learning_rate": 1.9917226741361014e-05, |
| "loss": 1.0383, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.4419889502762431, |
| "grad_norm": 0.69921875, |
| "learning_rate": 1.987076728115383e-05, |
| "loss": 1.0168, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.4696132596685083, |
| "grad_norm": 0.68359375, |
| "learning_rate": 1.9814081638776743e-05, |
| "loss": 1.004, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.4972375690607735, |
| "grad_norm": 0.7109375, |
| "learning_rate": 1.9747228540941555e-05, |
| "loss": 0.993, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.5248618784530387, |
| "grad_norm": 0.765625, |
| "learning_rate": 1.9670277247913205e-05, |
| "loss": 0.9644, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.5524861878453039, |
| "grad_norm": 0.76953125, |
| "learning_rate": 1.958330748175568e-05, |
| "loss": 0.996, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.580110497237569, |
| "grad_norm": 0.734375, |
| "learning_rate": 1.948640934373939e-05, |
| "loss": 0.9704, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.6077348066298343, |
| "grad_norm": 0.76953125, |
| "learning_rate": 1.9379683220995657e-05, |
| "loss": 0.9572, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.6353591160220995, |
| "grad_norm": 0.86328125, |
| "learning_rate": 1.9263239682514953e-05, |
| "loss": 0.9553, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.6629834254143646, |
| "grad_norm": 0.75, |
| "learning_rate": 1.9137199364596673e-05, |
| "loss": 0.9454, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.6906077348066298, |
| "grad_norm": 0.75390625, |
| "learning_rate": 1.9001692845869113e-05, |
| "loss": 0.939, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.7182320441988951, |
| "grad_norm": 0.77734375, |
| "learning_rate": 1.8856860512009115e-05, |
| "loss": 0.9433, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.7458563535911602, |
| "grad_norm": 0.76953125, |
| "learning_rate": 1.8702852410301556e-05, |
| "loss": 0.9329, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.7734806629834254, |
| "grad_norm": 0.71484375, |
| "learning_rate": 1.853982809418932e-05, |
| "loss": 0.9416, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.8011049723756906, |
| "grad_norm": 0.7265625, |
| "learning_rate": 1.8367956457974872e-05, |
| "loss": 0.914, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.8287292817679558, |
| "grad_norm": 0.87890625, |
| "learning_rate": 1.8187415561844586e-05, |
| "loss": 0.9229, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.856353591160221, |
| "grad_norm": 0.828125, |
| "learning_rate": 1.7998392447397197e-05, |
| "loss": 0.9259, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.8839779005524862, |
| "grad_norm": 0.85546875, |
| "learning_rate": 1.7801082943867406e-05, |
| "loss": 0.9421, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.9116022099447514, |
| "grad_norm": 0.8046875, |
| "learning_rate": 1.7595691465245484e-05, |
| "loss": 0.9225, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.9392265193370166, |
| "grad_norm": 0.7109375, |
| "learning_rate": 1.7382430798502977e-05, |
| "loss": 0.9066, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.9668508287292817, |
| "grad_norm": 0.83984375, |
| "learning_rate": 1.7161521883143936e-05, |
| "loss": 0.8903, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.994475138121547, |
| "grad_norm": 0.875, |
| "learning_rate": 1.693319358231011e-05, |
| "loss": 0.9252, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.022099447513812, |
| "grad_norm": 0.8203125, |
| "learning_rate": 1.6697682445677158e-05, |
| "loss": 0.9035, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.0497237569060773, |
| "grad_norm": 0.69140625, |
| "learning_rate": 1.6455232464387587e-05, |
| "loss": 0.9036, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.0773480662983426, |
| "grad_norm": 0.71875, |
| "learning_rate": 1.6206094818274228e-05, |
| "loss": 0.8932, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.1049723756906078, |
| "grad_norm": 0.91796875, |
| "learning_rate": 1.595052761563627e-05, |
| "loss": 0.9065, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.132596685082873, |
| "grad_norm": 0.859375, |
| "learning_rate": 1.5688795625837274e-05, |
| "loss": 0.8995, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.160220994475138, |
| "grad_norm": 0.78125, |
| "learning_rate": 1.542117000500229e-05, |
| "loss": 0.8844, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.1878453038674033, |
| "grad_norm": 0.765625, |
| "learning_rate": 1.5147928015098309e-05, |
| "loss": 0.8894, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.2154696132596685, |
| "grad_norm": 0.81640625, |
| "learning_rate": 1.4869352736688938e-05, |
| "loss": 0.894, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.2430939226519337, |
| "grad_norm": 0.76953125, |
| "learning_rate": 1.458573277566103e-05, |
| "loss": 0.9222, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.270718232044199, |
| "grad_norm": 0.75390625, |
| "learning_rate": 1.4297361964227004e-05, |
| "loss": 0.9014, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.298342541436464, |
| "grad_norm": 0.8125, |
| "learning_rate": 1.4004539056512667e-05, |
| "loss": 0.9052, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.3259668508287292, |
| "grad_norm": 0.87109375, |
| "learning_rate": 1.3707567419045926e-05, |
| "loss": 0.894, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.3535911602209945, |
| "grad_norm": 0.8125, |
| "learning_rate": 1.3406754716466978e-05, |
| "loss": 0.9045, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.3812154696132597, |
| "grad_norm": 0.78515625, |
| "learning_rate": 1.3102412592785654e-05, |
| "loss": 0.8737, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.408839779005525, |
| "grad_norm": 0.7734375, |
| "learning_rate": 1.2794856348516095e-05, |
| "loss": 0.9029, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.43646408839779, |
| "grad_norm": 0.83984375, |
| "learning_rate": 1.248440461402328e-05, |
| "loss": 0.8883, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.4640883977900552, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.2171379019419786e-05, |
| "loss": 0.8932, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.4917127071823204, |
| "grad_norm": 0.890625, |
| "learning_rate": 1.1856103861354809e-05, |
| "loss": 0.8917, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.5193370165745856, |
| "grad_norm": 0.78515625, |
| "learning_rate": 1.153890576704062e-05, |
| "loss": 0.9033, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.5469613259668509, |
| "grad_norm": 0.8046875, |
| "learning_rate": 1.1220113355864549e-05, |
| "loss": 0.8839, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.5745856353591159, |
| "grad_norm": 0.8046875, |
| "learning_rate": 1.0900056898937055e-05, |
| "loss": 0.8887, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.6022099447513813, |
| "grad_norm": 0.8046875, |
| "learning_rate": 1.0579067976928614e-05, |
| "loss": 0.8951, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.6298342541436464, |
| "grad_norm": 0.78125, |
| "learning_rate": 1.0257479136549889e-05, |
| "loss": 0.8954, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.6574585635359116, |
| "grad_norm": 0.8359375, |
| "learning_rate": 9.935623546031043e-06, |
| "loss": 0.9004, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.6850828729281768, |
| "grad_norm": 0.85546875, |
| "learning_rate": 9.613834649957216e-06, |
| "loss": 0.9045, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.7127071823204418, |
| "grad_norm": 0.765625, |
| "learning_rate": 9.292445823817647e-06, |
| "loss": 0.8737, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.7403314917127073, |
| "grad_norm": 0.8203125, |
| "learning_rate": 8.971790028626395e-06, |
| "loss": 0.8722, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.7679558011049723, |
| "grad_norm": 0.921875, |
| "learning_rate": 8.652199465972462e-06, |
| "loss": 0.8995, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.7955801104972375, |
| "grad_norm": 0.8203125, |
| "learning_rate": 8.334005233856681e-06, |
| "loss": 0.9114, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.8232044198895028, |
| "grad_norm": 0.79296875, |
| "learning_rate": 8.017536983671929e-06, |
| "loss": 0.891, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.850828729281768, |
| "grad_norm": 0.75, |
| "learning_rate": 7.703122578682047e-06, |
| "loss": 0.8875, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.8784530386740332, |
| "grad_norm": 0.8125, |
| "learning_rate": 7.391087754353252e-06, |
| "loss": 0.8779, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.9060773480662982, |
| "grad_norm": 0.76953125, |
| "learning_rate": 7.081755780889978e-06, |
| "loss": 0.885, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.9337016574585635, |
| "grad_norm": 0.828125, |
| "learning_rate": 6.7754471283247594e-06, |
| "loss": 0.8875, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.9613259668508287, |
| "grad_norm": 0.9140625, |
| "learning_rate": 6.472479134509052e-06, |
| "loss": 0.9037, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.988950276243094, |
| "grad_norm": 0.83203125, |
| "learning_rate": 6.173165676349103e-06, |
| "loss": 0.8817, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.016574585635359, |
| "grad_norm": 0.81640625, |
| "learning_rate": 5.8778168446273045e-06, |
| "loss": 0.8876, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.044198895027624, |
| "grad_norm": 0.77734375, |
| "learning_rate": 5.586738622746042e-06, |
| "loss": 0.891, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.0718232044198897, |
| "grad_norm": 0.80078125, |
| "learning_rate": 5.300232569726805e-06, |
| "loss": 0.8843, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.0994475138121547, |
| "grad_norm": 0.9140625, |
| "learning_rate": 5.0185955077929774e-06, |
| "loss": 0.8696, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.12707182320442, |
| "grad_norm": 0.7578125, |
| "learning_rate": 4.742119214860009e-06, |
| "loss": 0.8775, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.154696132596685, |
| "grad_norm": 0.7578125, |
| "learning_rate": 4.471090122251496e-06, |
| "loss": 0.8797, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.18232044198895, |
| "grad_norm": 0.78125, |
| "learning_rate": 4.205789017954364e-06, |
| "loss": 0.8832, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.2099447513812156, |
| "grad_norm": 0.859375, |
| "learning_rate": 3.946490755720621e-06, |
| "loss": 0.884, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.2375690607734806, |
| "grad_norm": 0.9453125, |
| "learning_rate": 3.6934639703169905e-06, |
| "loss": 0.8737, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.265193370165746, |
| "grad_norm": 0.86328125, |
| "learning_rate": 3.4469707992174607e-06, |
| "loss": 0.8981, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.292817679558011, |
| "grad_norm": 0.80078125, |
| "learning_rate": 3.207266611027069e-06, |
| "loss": 0.8736, |
| "step": 415 |
| }, |
| { |
| "epoch": 2.320441988950276, |
| "grad_norm": 0.78125, |
| "learning_rate": 2.97459974091831e-06, |
| "loss": 0.8757, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.3480662983425415, |
| "grad_norm": 0.8203125, |
| "learning_rate": 2.7492112333541744e-06, |
| "loss": 0.902, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.3756906077348066, |
| "grad_norm": 0.90625, |
| "learning_rate": 2.531334592364457e-06, |
| "loss": 0.8766, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.403314917127072, |
| "grad_norm": 0.7734375, |
| "learning_rate": 2.3211955396340003e-06, |
| "loss": 0.8982, |
| "step": 435 |
| }, |
| { |
| "epoch": 2.430939226519337, |
| "grad_norm": 0.74609375, |
| "learning_rate": 2.1190117806534714e-06, |
| "loss": 0.8801, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.458563535911602, |
| "grad_norm": 0.828125, |
| "learning_rate": 1.924992779174999e-06, |
| "loss": 0.8707, |
| "step": 445 |
| }, |
| { |
| "epoch": 2.4861878453038675, |
| "grad_norm": 0.859375, |
| "learning_rate": 1.7393395402063085e-06, |
| "loss": 0.8939, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.5138121546961325, |
| "grad_norm": 0.859375, |
| "learning_rate": 1.5622444017681438e-06, |
| "loss": 0.8707, |
| "step": 455 |
| }, |
| { |
| "epoch": 2.541436464088398, |
| "grad_norm": 0.78515625, |
| "learning_rate": 1.3938908356307846e-06, |
| "loss": 0.8771, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.569060773480663, |
| "grad_norm": 0.76171875, |
| "learning_rate": 1.2344532572360325e-06, |
| "loss": 0.857, |
| "step": 465 |
| }, |
| { |
| "epoch": 2.596685082872928, |
| "grad_norm": 0.83984375, |
| "learning_rate": 1.0840968450016276e-06, |
| "loss": 0.885, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.6243093922651934, |
| "grad_norm": 0.78125, |
| "learning_rate": 9.42977369195286e-07, |
| "loss": 0.9007, |
| "step": 475 |
| }, |
| { |
| "epoch": 2.6519337016574585, |
| "grad_norm": 0.90234375, |
| "learning_rate": 8.112410305556307e-07, |
| "loss": 0.8988, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.679558011049724, |
| "grad_norm": 0.78125, |
| "learning_rate": 6.890243088272453e-07, |
| "loss": 0.8702, |
| "step": 485 |
| }, |
| { |
| "epoch": 2.707182320441989, |
| "grad_norm": 0.875, |
| "learning_rate": 5.764538213667103e-07, |
| "loss": 0.8981, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.734806629834254, |
| "grad_norm": 0.89453125, |
| "learning_rate": 4.73646191966175e-07, |
| "loss": 0.8912, |
| "step": 495 |
| }, |
| { |
| "epoch": 2.7624309392265194, |
| "grad_norm": 0.875, |
| "learning_rate": 3.8070793003030296e-07, |
| "loss": 0.8967, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.7900552486187844, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.9773532023180897e-07, |
| "loss": 0.8933, |
| "step": 505 |
| }, |
| { |
| "epoch": 2.81767955801105, |
| "grad_norm": 0.7578125, |
| "learning_rate": 2.248143227598809e-07, |
| "loss": 0.9039, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.845303867403315, |
| "grad_norm": 0.7578125, |
| "learning_rate": 1.6202048426483652e-07, |
| "loss": 0.8864, |
| "step": 515 |
| }, |
| { |
| "epoch": 2.87292817679558, |
| "grad_norm": 0.8515625, |
| "learning_rate": 1.094188595912804e-07, |
| "loss": 0.8997, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.9005524861878453, |
| "grad_norm": 0.87109375, |
| "learning_rate": 6.706394438083962e-08, |
| "loss": 0.8805, |
| "step": 525 |
| }, |
| { |
| "epoch": 2.9281767955801103, |
| "grad_norm": 0.7890625, |
| "learning_rate": 3.4999618614309784e-08, |
| "loss": 0.8959, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.955801104972376, |
| "grad_norm": 0.7265625, |
| "learning_rate": 1.325910115169471e-08, |
| "loss": 0.8552, |
| "step": 535 |
| }, |
| { |
| "epoch": 2.983425414364641, |
| "grad_norm": 0.82421875, |
| "learning_rate": 1.8649153172423106e-09, |
| "loss": 0.8753, |
| "step": 540 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 543, |
| "total_flos": 2.5856227500097536e+16, |
| "train_loss": 1.017678025019103, |
| "train_runtime": 224.2528, |
| "train_samples_per_second": 38.568, |
| "train_steps_per_second": 2.421 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 543, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.5856227500097536e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|