| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 733, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0068212824010914054, | |
| "grad_norm": 2.859520152533828, | |
| "learning_rate": 6.7567567567567575e-06, | |
| "loss": 0.8358, | |
| "num_tokens": 3759146.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.013642564802182811, | |
| "grad_norm": 1.9018163630483922, | |
| "learning_rate": 1.3513513513513515e-05, | |
| "loss": 0.7639, | |
| "num_tokens": 7668808.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.020463847203274217, | |
| "grad_norm": 0.5801403334832769, | |
| "learning_rate": 2.0270270270270273e-05, | |
| "loss": 0.6614, | |
| "num_tokens": 11368873.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.027285129604365622, | |
| "grad_norm": 0.572102113014263, | |
| "learning_rate": 2.702702702702703e-05, | |
| "loss": 0.6119, | |
| "num_tokens": 15118063.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.034106412005457026, | |
| "grad_norm": 0.4278039947946872, | |
| "learning_rate": 3.3783783783783784e-05, | |
| "loss": 0.5902, | |
| "num_tokens": 18906839.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.040927694406548434, | |
| "grad_norm": 0.38997902119107797, | |
| "learning_rate": 4.0540540540540545e-05, | |
| "loss": 0.5683, | |
| "num_tokens": 22641755.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.047748976807639835, | |
| "grad_norm": 0.3731809054939738, | |
| "learning_rate": 4.72972972972973e-05, | |
| "loss": 0.5642, | |
| "num_tokens": 26636629.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.054570259208731244, | |
| "grad_norm": 0.3482748515886515, | |
| "learning_rate": 4.999793714044176e-05, | |
| "loss": 0.5359, | |
| "num_tokens": 30417967.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.061391541609822645, | |
| "grad_norm": 0.37293778656884924, | |
| "learning_rate": 4.9985332146267735e-05, | |
| "loss": 0.5384, | |
| "num_tokens": 34231333.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.06821282401091405, | |
| "grad_norm": 0.36563686554902025, | |
| "learning_rate": 4.996127460337901e-05, | |
| "loss": 0.539, | |
| "num_tokens": 37961424.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07503410641200546, | |
| "grad_norm": 0.49916502547033137, | |
| "learning_rate": 4.992577676510502e-05, | |
| "loss": 0.5403, | |
| "num_tokens": 41826860.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.08185538881309687, | |
| "grad_norm": 0.4663794674995223, | |
| "learning_rate": 4.987885671170889e-05, | |
| "loss": 0.5286, | |
| "num_tokens": 45543403.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08867667121418826, | |
| "grad_norm": 0.47229935539155693, | |
| "learning_rate": 4.9820538341178595e-05, | |
| "loss": 0.5321, | |
| "num_tokens": 49369486.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.09549795361527967, | |
| "grad_norm": 0.3986951551087616, | |
| "learning_rate": 4.97508513570549e-05, | |
| "loss": 0.5227, | |
| "num_tokens": 53010874.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.10231923601637108, | |
| "grad_norm": 0.40140582830322363, | |
| "learning_rate": 4.966983125330225e-05, | |
| "loss": 0.5244, | |
| "num_tokens": 56909889.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.10914051841746249, | |
| "grad_norm": 0.4349545422715694, | |
| "learning_rate": 4.957751929623059e-05, | |
| "loss": 0.5094, | |
| "num_tokens": 60650570.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.11596180081855388, | |
| "grad_norm": 0.3803902727248126, | |
| "learning_rate": 4.947396250347695e-05, | |
| "loss": 0.5033, | |
| "num_tokens": 64564660.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.12278308321964529, | |
| "grad_norm": 0.3736381760976187, | |
| "learning_rate": 4.9359213620057766e-05, | |
| "loss": 0.5192, | |
| "num_tokens": 68426882.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1296043656207367, | |
| "grad_norm": 0.33996053767832096, | |
| "learning_rate": 4.9233331091504034e-05, | |
| "loss": 0.5154, | |
| "num_tokens": 72252819.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.1364256480218281, | |
| "grad_norm": 0.4698197228973068, | |
| "learning_rate": 4.909637903409306e-05, | |
| "loss": 0.504, | |
| "num_tokens": 76160914.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1432469304229195, | |
| "grad_norm": 0.3493439695910741, | |
| "learning_rate": 4.8948427202191766e-05, | |
| "loss": 0.5057, | |
| "num_tokens": 80152955.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.15006821282401092, | |
| "grad_norm": 0.3516586136746583, | |
| "learning_rate": 4.878955095272844e-05, | |
| "loss": 0.5098, | |
| "num_tokens": 83901702.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.15688949522510232, | |
| "grad_norm": 0.42408743833551077, | |
| "learning_rate": 4.861983120681089e-05, | |
| "loss": 0.5088, | |
| "num_tokens": 87691018.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.16371077762619374, | |
| "grad_norm": 0.3475562440630165, | |
| "learning_rate": 4.8439354408510536e-05, | |
| "loss": 0.4976, | |
| "num_tokens": 91542428.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.17053206002728513, | |
| "grad_norm": 0.36672881758295617, | |
| "learning_rate": 4.82482124808335e-05, | |
| "loss": 0.5047, | |
| "num_tokens": 95438170.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.17735334242837653, | |
| "grad_norm": 0.3716213307471467, | |
| "learning_rate": 4.804650277890105e-05, | |
| "loss": 0.4993, | |
| "num_tokens": 99383692.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.18417462482946795, | |
| "grad_norm": 0.4114376174123406, | |
| "learning_rate": 4.783432804036335e-05, | |
| "loss": 0.4997, | |
| "num_tokens": 103223537.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.19099590723055934, | |
| "grad_norm": 0.417963130595086, | |
| "learning_rate": 4.761179633307163e-05, | |
| "loss": 0.511, | |
| "num_tokens": 106901687.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.19781718963165076, | |
| "grad_norm": 0.3590365052314863, | |
| "learning_rate": 4.737902100003552e-05, | |
| "loss": 0.4863, | |
| "num_tokens": 110840758.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.20463847203274216, | |
| "grad_norm": 0.3336081440893747, | |
| "learning_rate": 4.713612060169362e-05, | |
| "loss": 0.5005, | |
| "num_tokens": 114521504.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.21145975443383355, | |
| "grad_norm": 0.3185236733259455, | |
| "learning_rate": 4.688321885552659e-05, | |
| "loss": 0.4875, | |
| "num_tokens": 118445759.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.21828103683492497, | |
| "grad_norm": 0.3596258991525576, | |
| "learning_rate": 4.662044457304359e-05, | |
| "loss": 0.4952, | |
| "num_tokens": 122311693.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.22510231923601637, | |
| "grad_norm": 0.31478964913575963, | |
| "learning_rate": 4.634793159417421e-05, | |
| "loss": 0.498, | |
| "num_tokens": 126094524.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.23192360163710776, | |
| "grad_norm": 0.339405025728744, | |
| "learning_rate": 4.606581871909919e-05, | |
| "loss": 0.492, | |
| "num_tokens": 129971056.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.23874488403819918, | |
| "grad_norm": 0.3817665665912771, | |
| "learning_rate": 4.577424963755475e-05, | |
| "loss": 0.5052, | |
| "num_tokens": 133765982.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.24556616643929058, | |
| "grad_norm": 0.455088588103575, | |
| "learning_rate": 4.547337285564649e-05, | |
| "loss": 0.4874, | |
| "num_tokens": 137522281.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.252387448840382, | |
| "grad_norm": 0.3863705096212881, | |
| "learning_rate": 4.516334162021013e-05, | |
| "loss": 0.4826, | |
| "num_tokens": 141196084.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.2592087312414734, | |
| "grad_norm": 0.36548889219989195, | |
| "learning_rate": 4.484431384075771e-05, | |
| "loss": 0.4923, | |
| "num_tokens": 145136704.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2660300136425648, | |
| "grad_norm": 0.3282797740156995, | |
| "learning_rate": 4.4516452009048814e-05, | |
| "loss": 0.4813, | |
| "num_tokens": 148940122.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.2728512960436562, | |
| "grad_norm": 0.3373945226676213, | |
| "learning_rate": 4.4179923116328005e-05, | |
| "loss": 0.4911, | |
| "num_tokens": 152809678.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.27967257844474763, | |
| "grad_norm": 0.2988913432631945, | |
| "learning_rate": 4.3834898568270444e-05, | |
| "loss": 0.4848, | |
| "num_tokens": 156683573.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.286493860845839, | |
| "grad_norm": 0.32252594879192, | |
| "learning_rate": 4.348155409767913e-05, | |
| "loss": 0.486, | |
| "num_tokens": 160433326.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2933151432469304, | |
| "grad_norm": 0.30855373605724573, | |
| "learning_rate": 4.3120069674978156e-05, | |
| "loss": 0.4883, | |
| "num_tokens": 164374316.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.30013642564802184, | |
| "grad_norm": 0.30234329464930193, | |
| "learning_rate": 4.275062941654767e-05, | |
| "loss": 0.4702, | |
| "num_tokens": 168223299.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3069577080491132, | |
| "grad_norm": 0.2841369078220348, | |
| "learning_rate": 4.237342149094701e-05, | |
| "loss": 0.4815, | |
| "num_tokens": 172132000.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.31377899045020463, | |
| "grad_norm": 0.32791055661062934, | |
| "learning_rate": 4.1988638023074116e-05, | |
| "loss": 0.4787, | |
| "num_tokens": 176086331.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.32060027285129605, | |
| "grad_norm": 0.3356851421085584, | |
| "learning_rate": 4.159647499630971e-05, | |
| "loss": 0.4708, | |
| "num_tokens": 179917640.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.3274215552523875, | |
| "grad_norm": 0.3037114872946393, | |
| "learning_rate": 4.1197132152696215e-05, | |
| "loss": 0.4822, | |
| "num_tokens": 183746129.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.33424283765347884, | |
| "grad_norm": 0.30572141355536847, | |
| "learning_rate": 4.07908128912024e-05, | |
| "loss": 0.4895, | |
| "num_tokens": 187699370.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.34106412005457026, | |
| "grad_norm": 0.3102442024034853, | |
| "learning_rate": 4.037772416412524e-05, | |
| "loss": 0.4739, | |
| "num_tokens": 191512142.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3478854024556617, | |
| "grad_norm": 0.3059715305515258, | |
| "learning_rate": 3.995807637168205e-05, | |
| "loss": 0.4751, | |
| "num_tokens": 195367749.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.35470668485675305, | |
| "grad_norm": 0.2904211221393888, | |
| "learning_rate": 3.9532083254846505e-05, | |
| "loss": 0.4648, | |
| "num_tokens": 199215371.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3615279672578445, | |
| "grad_norm": 0.33681948143185453, | |
| "learning_rate": 3.909996178648299e-05, | |
| "loss": 0.4826, | |
| "num_tokens": 202903048.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.3683492496589359, | |
| "grad_norm": 0.3285871766466283, | |
| "learning_rate": 3.866193206083494e-05, | |
| "loss": 0.4727, | |
| "num_tokens": 206761213.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.37517053206002726, | |
| "grad_norm": 0.33670385009296144, | |
| "learning_rate": 3.821821718142332e-05, | |
| "loss": 0.4694, | |
| "num_tokens": 210585632.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.3819918144611187, | |
| "grad_norm": 0.3305593264219332, | |
| "learning_rate": 3.77690431474123e-05, | |
| "loss": 0.4744, | |
| "num_tokens": 214261738.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3888130968622101, | |
| "grad_norm": 0.29054732905430486, | |
| "learning_rate": 3.7314638738500265e-05, | |
| "loss": 0.479, | |
| "num_tokens": 218144024.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.3956343792633015, | |
| "grad_norm": 0.2919105758915809, | |
| "learning_rate": 3.685523539839439e-05, | |
| "loss": 0.4752, | |
| "num_tokens": 222057295.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4024556616643929, | |
| "grad_norm": 0.2954441471813468, | |
| "learning_rate": 3.63910671169285e-05, | |
| "loss": 0.4671, | |
| "num_tokens": 225828573.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.4092769440654843, | |
| "grad_norm": 0.33726151653757314, | |
| "learning_rate": 3.5922370310884014e-05, | |
| "loss": 0.4664, | |
| "num_tokens": 229710487.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.41609822646657574, | |
| "grad_norm": 0.2994069615286232, | |
| "learning_rate": 3.5449383703574806e-05, | |
| "loss": 0.4801, | |
| "num_tokens": 233617525.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.4229195088676671, | |
| "grad_norm": 0.321770217300059, | |
| "learning_rate": 3.4972348203257274e-05, | |
| "loss": 0.4774, | |
| "num_tokens": 237394471.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4297407912687585, | |
| "grad_norm": 0.3342140614239043, | |
| "learning_rate": 3.449150678042748e-05, | |
| "loss": 0.4732, | |
| "num_tokens": 241114261.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.43656207366984995, | |
| "grad_norm": 0.311787983749093, | |
| "learning_rate": 3.400710434406803e-05, | |
| "loss": 0.4727, | |
| "num_tokens": 244967987.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4433833560709413, | |
| "grad_norm": 0.3125973899622851, | |
| "learning_rate": 3.351938761690748e-05, | |
| "loss": 0.4789, | |
| "num_tokens": 248751095.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.45020463847203274, | |
| "grad_norm": 0.3128243315361879, | |
| "learning_rate": 3.302860500975605e-05, | |
| "loss": 0.4678, | |
| "num_tokens": 252607265.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.45702592087312416, | |
| "grad_norm": 0.2899975717949231, | |
| "learning_rate": 3.253500649498153e-05, | |
| "loss": 0.4736, | |
| "num_tokens": 256417909.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.4638472032742155, | |
| "grad_norm": 0.2947121469603074, | |
| "learning_rate": 3.203884347918975e-05, | |
| "loss": 0.4663, | |
| "num_tokens": 260456429.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.47066848567530695, | |
| "grad_norm": 0.2747115046593522, | |
| "learning_rate": 3.154036867517462e-05, | |
| "loss": 0.4601, | |
| "num_tokens": 264287905.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.47748976807639837, | |
| "grad_norm": 0.2684844916597318, | |
| "learning_rate": 3.1039835973202865e-05, | |
| "loss": 0.4689, | |
| "num_tokens": 268098790.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4843110504774898, | |
| "grad_norm": 0.271171033537078, | |
| "learning_rate": 3.053750031169903e-05, | |
| "loss": 0.4769, | |
| "num_tokens": 271974065.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.49113233287858116, | |
| "grad_norm": 0.29862281272985114, | |
| "learning_rate": 3.0033617547396614e-05, | |
| "loss": 0.4804, | |
| "num_tokens": 275852045.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4979536152796726, | |
| "grad_norm": 0.2791498620793116, | |
| "learning_rate": 2.9528444325021477e-05, | |
| "loss": 0.4603, | |
| "num_tokens": 279504484.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.504774897680764, | |
| "grad_norm": 0.27773797940501777, | |
| "learning_rate": 2.902223794657391e-05, | |
| "loss": 0.4623, | |
| "num_tokens": 283461546.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5115961800818554, | |
| "grad_norm": 0.2501959816908535, | |
| "learning_rate": 2.8515256240275946e-05, | |
| "loss": 0.4692, | |
| "num_tokens": 287371918.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5184174624829468, | |
| "grad_norm": 0.2508499415881564, | |
| "learning_rate": 2.8007757429250597e-05, | |
| "loss": 0.4575, | |
| "num_tokens": 291057738.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5252387448840382, | |
| "grad_norm": 0.23841623530318584, | |
| "learning_rate": 2.7500000000000004e-05, | |
| "loss": 0.4657, | |
| "num_tokens": 294881963.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.5320600272851296, | |
| "grad_norm": 0.27722631135406894, | |
| "learning_rate": 2.699224257074941e-05, | |
| "loss": 0.4666, | |
| "num_tokens": 298677895.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.538881309686221, | |
| "grad_norm": 0.26944884620681187, | |
| "learning_rate": 2.6484743759724062e-05, | |
| "loss": 0.4528, | |
| "num_tokens": 302387985.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.5457025920873124, | |
| "grad_norm": 0.2683881461913158, | |
| "learning_rate": 2.5977762053426098e-05, | |
| "loss": 0.4698, | |
| "num_tokens": 306130593.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5525238744884038, | |
| "grad_norm": 0.2292317979583899, | |
| "learning_rate": 2.547155567497854e-05, | |
| "loss": 0.4706, | |
| "num_tokens": 309952008.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.5593451568894953, | |
| "grad_norm": 0.262653672464314, | |
| "learning_rate": 2.496638245260339e-05, | |
| "loss": 0.4576, | |
| "num_tokens": 313725489.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5661664392905866, | |
| "grad_norm": 0.31628174437057555, | |
| "learning_rate": 2.446249968830097e-05, | |
| "loss": 0.4621, | |
| "num_tokens": 317525148.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.572987721691678, | |
| "grad_norm": 0.2971582407388678, | |
| "learning_rate": 2.3960164026797137e-05, | |
| "loss": 0.4625, | |
| "num_tokens": 321410101.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5798090040927695, | |
| "grad_norm": 0.2736258492313367, | |
| "learning_rate": 2.3459631324825388e-05, | |
| "loss": 0.4579, | |
| "num_tokens": 325102278.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.5866302864938608, | |
| "grad_norm": 0.27230747471072914, | |
| "learning_rate": 2.2961156520810255e-05, | |
| "loss": 0.4623, | |
| "num_tokens": 328831071.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5934515688949522, | |
| "grad_norm": 0.2870356128439239, | |
| "learning_rate": 2.246499350501848e-05, | |
| "loss": 0.4527, | |
| "num_tokens": 332767044.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6002728512960437, | |
| "grad_norm": 0.23826692416074355, | |
| "learning_rate": 2.197139499024396e-05, | |
| "loss": 0.4503, | |
| "num_tokens": 336595638.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.607094133697135, | |
| "grad_norm": 0.26903175563152104, | |
| "learning_rate": 2.1480612383092536e-05, | |
| "loss": 0.4621, | |
| "num_tokens": 340358925.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.6139154160982264, | |
| "grad_norm": 0.23858645047531596, | |
| "learning_rate": 2.0992895655931984e-05, | |
| "loss": 0.4606, | |
| "num_tokens": 344239058.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6207366984993179, | |
| "grad_norm": 0.26224623620936377, | |
| "learning_rate": 2.0508493219572522e-05, | |
| "loss": 0.4638, | |
| "num_tokens": 348080585.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.6275579809004093, | |
| "grad_norm": 0.22212001743500964, | |
| "learning_rate": 2.0027651796742735e-05, | |
| "loss": 0.4578, | |
| "num_tokens": 351817695.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6343792633015006, | |
| "grad_norm": 0.24324689513326436, | |
| "learning_rate": 1.95506162964252e-05, | |
| "loss": 0.4537, | |
| "num_tokens": 355639183.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.6412005457025921, | |
| "grad_norm": 0.2377209139662622, | |
| "learning_rate": 1.9077629689115995e-05, | |
| "loss": 0.4697, | |
| "num_tokens": 359437581.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6480218281036835, | |
| "grad_norm": 0.2525718316267335, | |
| "learning_rate": 1.8608932883071507e-05, | |
| "loss": 0.4483, | |
| "num_tokens": 363189983.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.654843110504775, | |
| "grad_norm": 0.24386193951321106, | |
| "learning_rate": 1.8144764601605613e-05, | |
| "loss": 0.4503, | |
| "num_tokens": 366863209.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6616643929058663, | |
| "grad_norm": 0.22335725928265934, | |
| "learning_rate": 1.7685361261499733e-05, | |
| "loss": 0.4631, | |
| "num_tokens": 370725860.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.6684856753069577, | |
| "grad_norm": 0.22739365537182124, | |
| "learning_rate": 1.72309568525877e-05, | |
| "loss": 0.4493, | |
| "num_tokens": 374566515.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6753069577080492, | |
| "grad_norm": 0.2518810180466547, | |
| "learning_rate": 1.6781782818576686e-05, | |
| "loss": 0.4434, | |
| "num_tokens": 378330489.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.6821282401091405, | |
| "grad_norm": 0.22437165048428956, | |
| "learning_rate": 1.6338067939165058e-05, | |
| "loss": 0.4475, | |
| "num_tokens": 382103468.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6889495225102319, | |
| "grad_norm": 0.24484654505585532, | |
| "learning_rate": 1.590003821351701e-05, | |
| "loss": 0.4558, | |
| "num_tokens": 385837297.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.6957708049113234, | |
| "grad_norm": 0.25262167850764056, | |
| "learning_rate": 1.54679167451535e-05, | |
| "loss": 0.4522, | |
| "num_tokens": 389586680.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.7025920873124147, | |
| "grad_norm": 0.25644327559623603, | |
| "learning_rate": 1.5041923628317948e-05, | |
| "loss": 0.4569, | |
| "num_tokens": 393428760.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.7094133697135061, | |
| "grad_norm": 0.24914618478726863, | |
| "learning_rate": 1.4622275835874766e-05, | |
| "loss": 0.4677, | |
| "num_tokens": 397158700.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.7162346521145976, | |
| "grad_norm": 0.2404286713923951, | |
| "learning_rate": 1.4209187108797607e-05, | |
| "loss": 0.4533, | |
| "num_tokens": 400923938.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.723055934515689, | |
| "grad_norm": 0.22063525670425096, | |
| "learning_rate": 1.3802867847303785e-05, | |
| "loss": 0.4483, | |
| "num_tokens": 404685655.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7298772169167803, | |
| "grad_norm": 0.21801652381587655, | |
| "learning_rate": 1.3403525003690304e-05, | |
| "loss": 0.4532, | |
| "num_tokens": 408582799.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.7366984993178718, | |
| "grad_norm": 0.20892765769673624, | |
| "learning_rate": 1.3011361976925884e-05, | |
| "loss": 0.4584, | |
| "num_tokens": 412386009.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7435197817189632, | |
| "grad_norm": 0.21186026658080837, | |
| "learning_rate": 1.2626578509052997e-05, | |
| "loss": 0.4603, | |
| "num_tokens": 416372039.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.7503410641200545, | |
| "grad_norm": 0.21768800059481686, | |
| "learning_rate": 1.2249370583452342e-05, | |
| "loss": 0.4468, | |
| "num_tokens": 420051975.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.757162346521146, | |
| "grad_norm": 0.2204584535704754, | |
| "learning_rate": 1.1879930325021841e-05, | |
| "loss": 0.447, | |
| "num_tokens": 423685709.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.7639836289222374, | |
| "grad_norm": 0.24319632994282764, | |
| "learning_rate": 1.1518445902320878e-05, | |
| "loss": 0.4439, | |
| "num_tokens": 427405904.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.7708049113233287, | |
| "grad_norm": 0.23395443194660598, | |
| "learning_rate": 1.1165101431729561e-05, | |
| "loss": 0.4442, | |
| "num_tokens": 431288378.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.7776261937244202, | |
| "grad_norm": 0.2353220319536335, | |
| "learning_rate": 1.0820076883671999e-05, | |
| "loss": 0.4467, | |
| "num_tokens": 435077995.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.7844474761255116, | |
| "grad_norm": 0.23920963211303678, | |
| "learning_rate": 1.0483547990951195e-05, | |
| "loss": 0.4464, | |
| "num_tokens": 439006864.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.791268758526603, | |
| "grad_norm": 0.23845367222027455, | |
| "learning_rate": 1.0155686159242297e-05, | |
| "loss": 0.4602, | |
| "num_tokens": 443045688.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.7980900409276944, | |
| "grad_norm": 0.23878766406558552, | |
| "learning_rate": 9.836658379789875e-06, | |
| "loss": 0.4487, | |
| "num_tokens": 446852018.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.8049113233287858, | |
| "grad_norm": 0.21045206753906473, | |
| "learning_rate": 9.52662714435352e-06, | |
| "loss": 0.4619, | |
| "num_tokens": 450687287.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8117326057298773, | |
| "grad_norm": 0.24180115500464866, | |
| "learning_rate": 9.225750362445255e-06, | |
| "loss": 0.4478, | |
| "num_tokens": 454483896.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.8185538881309686, | |
| "grad_norm": 0.22414629123508337, | |
| "learning_rate": 8.93418128090081e-06, | |
| "loss": 0.4464, | |
| "num_tokens": 458325861.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.82537517053206, | |
| "grad_norm": 0.22622883948448075, | |
| "learning_rate": 8.652068405825798e-06, | |
| "loss": 0.4519, | |
| "num_tokens": 462055868.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.8321964529331515, | |
| "grad_norm": 0.2002504054474251, | |
| "learning_rate": 8.379555426956415e-06, | |
| "loss": 0.4461, | |
| "num_tokens": 465973876.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.8390177353342428, | |
| "grad_norm": 0.2196686538554551, | |
| "learning_rate": 8.11678114447342e-06, | |
| "loss": 0.4446, | |
| "num_tokens": 469791498.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.8458390177353342, | |
| "grad_norm": 0.21537486730907285, | |
| "learning_rate": 7.863879398306385e-06, | |
| "loss": 0.4419, | |
| "num_tokens": 473570581.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.8526603001364257, | |
| "grad_norm": 0.2386317506871887, | |
| "learning_rate": 7.620978999964487e-06, | |
| "loss": 0.4558, | |
| "num_tokens": 477401353.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.859481582537517, | |
| "grad_norm": 0.20977878997335292, | |
| "learning_rate": 7.3882036669283754e-06, | |
| "loss": 0.4553, | |
| "num_tokens": 481348892.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.8663028649386084, | |
| "grad_norm": 0.21563339045910554, | |
| "learning_rate": 7.16567195963665e-06, | |
| "loss": 0.4544, | |
| "num_tokens": 485171910.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.8731241473396999, | |
| "grad_norm": 0.20119697622543456, | |
| "learning_rate": 6.953497221098949e-06, | |
| "loss": 0.4413, | |
| "num_tokens": 489059548.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.8799454297407913, | |
| "grad_norm": 0.20245196883946875, | |
| "learning_rate": 6.751787519166505e-06, | |
| "loss": 0.4431, | |
| "num_tokens": 492893029.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.8867667121418826, | |
| "grad_norm": 0.2144202772255573, | |
| "learning_rate": 6.560645591489468e-06, | |
| "loss": 0.45, | |
| "num_tokens": 496905674.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.8935879945429741, | |
| "grad_norm": 0.1960337560239353, | |
| "learning_rate": 6.380168793189115e-06, | |
| "loss": 0.4464, | |
| "num_tokens": 500864542.0, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.9004092769440655, | |
| "grad_norm": 0.19741690445074656, | |
| "learning_rate": 6.210449047271566e-06, | |
| "loss": 0.4492, | |
| "num_tokens": 504810366.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.9072305593451568, | |
| "grad_norm": 0.203527801548371, | |
| "learning_rate": 6.0515727978082415e-06, | |
| "loss": 0.446, | |
| "num_tokens": 508607232.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.9140518417462483, | |
| "grad_norm": 0.2114927842592368, | |
| "learning_rate": 5.9036209659069404e-06, | |
| "loss": 0.4519, | |
| "num_tokens": 512474084.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.9208731241473397, | |
| "grad_norm": 0.21111293452840946, | |
| "learning_rate": 5.766668908495966e-06, | |
| "loss": 0.4438, | |
| "num_tokens": 516216104.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.927694406548431, | |
| "grad_norm": 0.2020840535619729, | |
| "learning_rate": 5.64078637994224e-06, | |
| "loss": 0.453, | |
| "num_tokens": 519966345.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9345156889495225, | |
| "grad_norm": 0.19990572835537956, | |
| "learning_rate": 5.526037496523051e-06, | |
| "loss": 0.4393, | |
| "num_tokens": 523793837.0, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.9413369713506139, | |
| "grad_norm": 0.21501511467608375, | |
| "learning_rate": 5.422480703769408e-06, | |
| "loss": 0.4523, | |
| "num_tokens": 527666864.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.9481582537517054, | |
| "grad_norm": 0.20500881616784905, | |
| "learning_rate": 5.330168746697747e-06, | |
| "loss": 0.4494, | |
| "num_tokens": 531466359.0, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.9549795361527967, | |
| "grad_norm": 0.19384439762912464, | |
| "learning_rate": 5.249148642945106e-06, | |
| "loss": 0.4513, | |
| "num_tokens": 535253217.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.9618008185538881, | |
| "grad_norm": 0.20677675163076573, | |
| "learning_rate": 5.179461658821403e-06, | |
| "loss": 0.4372, | |
| "num_tokens": 539137436.0, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.9686221009549796, | |
| "grad_norm": 0.2176204558853245, | |
| "learning_rate": 5.121143288291119e-06, | |
| "loss": 0.447, | |
| "num_tokens": 542824098.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.975443383356071, | |
| "grad_norm": 0.20886934132711027, | |
| "learning_rate": 5.07422323489499e-06, | |
| "loss": 0.4566, | |
| "num_tokens": 546578047.0, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.9822646657571623, | |
| "grad_norm": 0.20458382129158925, | |
| "learning_rate": 5.03872539662099e-06, | |
| "loss": 0.4446, | |
| "num_tokens": 550377811.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.9890859481582538, | |
| "grad_norm": 0.18980320361368563, | |
| "learning_rate": 5.014667853732269e-06, | |
| "loss": 0.4403, | |
| "num_tokens": 554224024.0, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.9959072305593452, | |
| "grad_norm": 0.18951137695952713, | |
| "learning_rate": 5.00206285955824e-06, | |
| "loss": 0.4485, | |
| "num_tokens": 558001366.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "num_tokens": 560311272.0, | |
| "step": 733, | |
| "total_flos": 1913806159609856.0, | |
| "train_loss": 0.48002010657061983, | |
| "train_runtime": 43121.1313, | |
| "train_samples_per_second": 2.174, | |
| "train_steps_per_second": 0.017 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 733, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1913806159609856.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |