diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.4002421707303649, + "epoch": 0.4502724420716605, "eval_steps": 500, - "global_step": 7272, + "global_step": 8181, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -50911,6 +50911,6369 @@ "learning_rate": 9.06425972327535e-06, "loss": 0.8406, "step": 7272 + }, + { + "epoch": 0.4002972095327206, + "grad_norm": 0.7348741292953491, + "learning_rate": 9.064007226547819e-06, + "loss": 0.8103, + "step": 7273 + }, + { + "epoch": 0.4003522483350762, + "grad_norm": 0.6561787724494934, + "learning_rate": 9.063754699276297e-06, + "loss": 0.6634, + "step": 7274 + }, + { + "epoch": 0.4004072871374319, + "grad_norm": 0.7924866080284119, + "learning_rate": 9.063502141462682e-06, + "loss": 0.6592, + "step": 7275 + }, + { + "epoch": 0.40046232593978753, + "grad_norm": 0.6873973608016968, + "learning_rate": 9.063249553108873e-06, + "loss": 0.7912, + "step": 7276 + }, + { + "epoch": 0.4005173647421432, + "grad_norm": 0.6872708797454834, + "learning_rate": 9.062996934216768e-06, + "loss": 0.732, + "step": 7277 + }, + { + "epoch": 0.40057240354449886, + "grad_norm": 0.7381585836410522, + "learning_rate": 9.062744284788265e-06, + "loss": 0.84, + "step": 7278 + }, + { + "epoch": 0.40062744234685455, + "grad_norm": 0.7885964512825012, + "learning_rate": 9.062491604825266e-06, + "loss": 0.8229, + "step": 7279 + }, + { + "epoch": 0.4006824811492102, + "grad_norm": 0.9066407680511475, + "learning_rate": 9.062238894329664e-06, + "loss": 0.7299, + "step": 7280 + }, + { + "epoch": 0.40073751995156587, + "grad_norm": 0.7694007754325867, + "learning_rate": 9.061986153303364e-06, + "loss": 0.8033, + "step": 7281 + }, + { + "epoch": 0.4007925587539215, + "grad_norm": 1.021766185760498, + "learning_rate": 9.061733381748263e-06, + "loss": 0.79, + "step": 7282 + }, + { + "epoch": 0.4008475975562772, + "grad_norm": 0.7776662111282349, + "learning_rate": 9.06148057966626e-06, + "loss": 0.8484, + "step": 7283 + }, + { + "epoch": 0.4009026363586328, + "grad_norm": 0.8646043539047241, + "learning_rate": 9.061227747059257e-06, + "loss": 0.8223, + "step": 7284 + }, + { + "epoch": 0.4009576751609885, + "grad_norm": 0.7347257733345032, + "learning_rate": 9.060974883929154e-06, + "loss": 0.8062, + "step": 7285 + }, + { + "epoch": 0.40101271396334415, + "grad_norm": 0.8233902454376221, + "learning_rate": 9.06072199027785e-06, + "loss": 0.8922, + "step": 7286 + }, + { + "epoch": 0.40106775276569984, + "grad_norm": 0.7099601030349731, + "learning_rate": 9.060469066107246e-06, + "loss": 0.7125, + "step": 7287 + }, + { + "epoch": 0.40112279156805547, + "grad_norm": 0.7549998164176941, + "learning_rate": 9.060216111419246e-06, + "loss": 0.7851, + "step": 7288 + }, + { + "epoch": 0.40117783037041116, + "grad_norm": 0.753516435623169, + "learning_rate": 9.059963126215748e-06, + "loss": 0.7831, + "step": 7289 + }, + { + "epoch": 0.4012328691727668, + "grad_norm": 0.6718429327011108, + "learning_rate": 9.059710110498651e-06, + "loss": 0.7305, + "step": 7290 + }, + { + "epoch": 0.4012879079751225, + "grad_norm": 0.6796036958694458, + "learning_rate": 9.05945706426986e-06, + "loss": 0.802, + "step": 7291 + }, + { + "epoch": 0.4013429467774781, + "grad_norm": 0.8046827912330627, + "learning_rate": 9.05920398753128e-06, + "loss": 0.7286, + "step": 7292 + }, + { + "epoch": 0.4013979855798338, + "grad_norm": 0.7518643140792847, + "learning_rate": 9.058950880284807e-06, + "loss": 0.7287, + "step": 7293 + }, + { + "epoch": 0.40145302438218944, + "grad_norm": 0.8386855125427246, + "learning_rate": 9.058697742532345e-06, + "loss": 0.8201, + "step": 7294 + }, + { + "epoch": 0.4015080631845451, + "grad_norm": 0.7780192494392395, + "learning_rate": 9.058444574275797e-06, + "loss": 0.7999, + "step": 7295 + }, + { + "epoch": 0.40156310198690076, + "grad_norm": 0.7715566754341125, + "learning_rate": 9.058191375517068e-06, + "loss": 0.732, + "step": 7296 + }, + { + "epoch": 0.40161814078925645, + "grad_norm": 0.9940280914306641, + "learning_rate": 9.057938146258057e-06, + "loss": 0.8247, + "step": 7297 + }, + { + "epoch": 0.4016731795916121, + "grad_norm": 0.7567923069000244, + "learning_rate": 9.05768488650067e-06, + "loss": 0.8254, + "step": 7298 + }, + { + "epoch": 0.4017282183939678, + "grad_norm": 0.7544496655464172, + "learning_rate": 9.05743159624681e-06, + "loss": 0.811, + "step": 7299 + }, + { + "epoch": 0.4017832571963234, + "grad_norm": 0.63368821144104, + "learning_rate": 9.05717827549838e-06, + "loss": 0.6498, + "step": 7300 + }, + { + "epoch": 0.4018382959986791, + "grad_norm": 0.7077621221542358, + "learning_rate": 9.056924924257284e-06, + "loss": 0.7401, + "step": 7301 + }, + { + "epoch": 0.40189333480103473, + "grad_norm": 0.6782366037368774, + "learning_rate": 9.056671542525426e-06, + "loss": 0.8013, + "step": 7302 + }, + { + "epoch": 0.40194837360339036, + "grad_norm": 0.6605678200721741, + "learning_rate": 9.056418130304709e-06, + "loss": 0.8038, + "step": 7303 + }, + { + "epoch": 0.40200341240574605, + "grad_norm": 0.8716840147972107, + "learning_rate": 9.056164687597041e-06, + "loss": 0.7652, + "step": 7304 + }, + { + "epoch": 0.4020584512081017, + "grad_norm": 0.8464542031288147, + "learning_rate": 9.055911214404325e-06, + "loss": 0.8663, + "step": 7305 + }, + { + "epoch": 0.4021134900104574, + "grad_norm": 0.7165409326553345, + "learning_rate": 9.055657710728466e-06, + "loss": 0.8028, + "step": 7306 + }, + { + "epoch": 0.402168528812813, + "grad_norm": 0.7313430309295654, + "learning_rate": 9.055404176571369e-06, + "loss": 0.7538, + "step": 7307 + }, + { + "epoch": 0.4022235676151687, + "grad_norm": 0.7757230401039124, + "learning_rate": 9.05515061193494e-06, + "loss": 0.9096, + "step": 7308 + }, + { + "epoch": 0.40227860641752433, + "grad_norm": 0.7178354859352112, + "learning_rate": 9.054897016821085e-06, + "loss": 0.7186, + "step": 7309 + }, + { + "epoch": 0.40233364521988, + "grad_norm": 0.8331356048583984, + "learning_rate": 9.054643391231708e-06, + "loss": 0.8724, + "step": 7310 + }, + { + "epoch": 0.40238868402223565, + "grad_norm": 0.7709757685661316, + "learning_rate": 9.054389735168717e-06, + "loss": 0.692, + "step": 7311 + }, + { + "epoch": 0.40244372282459134, + "grad_norm": 0.7393380999565125, + "learning_rate": 9.054136048634018e-06, + "loss": 0.7863, + "step": 7312 + }, + { + "epoch": 0.402498761626947, + "grad_norm": 0.7372385859489441, + "learning_rate": 9.053882331629518e-06, + "loss": 0.781, + "step": 7313 + }, + { + "epoch": 0.40255380042930267, + "grad_norm": 0.7076019048690796, + "learning_rate": 9.053628584157123e-06, + "loss": 0.7598, + "step": 7314 + }, + { + "epoch": 0.4026088392316583, + "grad_norm": 0.7465673685073853, + "learning_rate": 9.053374806218742e-06, + "loss": 0.7454, + "step": 7315 + }, + { + "epoch": 0.402663878034014, + "grad_norm": 0.7414120435714722, + "learning_rate": 9.05312099781628e-06, + "loss": 0.7135, + "step": 7316 + }, + { + "epoch": 0.4027189168363696, + "grad_norm": 0.7490748167037964, + "learning_rate": 9.052867158951646e-06, + "loss": 0.6833, + "step": 7317 + }, + { + "epoch": 0.4027739556387253, + "grad_norm": 0.8027878999710083, + "learning_rate": 9.052613289626747e-06, + "loss": 0.7466, + "step": 7318 + }, + { + "epoch": 0.40282899444108095, + "grad_norm": 0.6777862310409546, + "learning_rate": 9.052359389843493e-06, + "loss": 0.7446, + "step": 7319 + }, + { + "epoch": 0.40288403324343663, + "grad_norm": 0.9240381717681885, + "learning_rate": 9.052105459603787e-06, + "loss": 0.7801, + "step": 7320 + }, + { + "epoch": 0.40293907204579227, + "grad_norm": 0.9592602252960205, + "learning_rate": 9.051851498909543e-06, + "loss": 0.9648, + "step": 7321 + }, + { + "epoch": 0.40299411084814796, + "grad_norm": 0.8469638228416443, + "learning_rate": 9.051597507762669e-06, + "loss": 0.8303, + "step": 7322 + }, + { + "epoch": 0.4030491496505036, + "grad_norm": 0.6981443166732788, + "learning_rate": 9.05134348616507e-06, + "loss": 0.7245, + "step": 7323 + }, + { + "epoch": 0.4031041884528593, + "grad_norm": 0.7133469581604004, + "learning_rate": 9.05108943411866e-06, + "loss": 0.7763, + "step": 7324 + }, + { + "epoch": 0.4031592272552149, + "grad_norm": 0.7043703198432922, + "learning_rate": 9.050835351625344e-06, + "loss": 0.8247, + "step": 7325 + }, + { + "epoch": 0.4032142660575706, + "grad_norm": 0.6662501692771912, + "learning_rate": 9.050581238687036e-06, + "loss": 0.7669, + "step": 7326 + }, + { + "epoch": 0.40326930485992624, + "grad_norm": 0.6482356786727905, + "learning_rate": 9.050327095305643e-06, + "loss": 0.6477, + "step": 7327 + }, + { + "epoch": 0.4033243436622819, + "grad_norm": 0.7465450167655945, + "learning_rate": 9.050072921483076e-06, + "loss": 0.8053, + "step": 7328 + }, + { + "epoch": 0.40337938246463756, + "grad_norm": 0.6765472292900085, + "learning_rate": 9.049818717221245e-06, + "loss": 0.765, + "step": 7329 + }, + { + "epoch": 0.40343442126699325, + "grad_norm": 0.7098689675331116, + "learning_rate": 9.04956448252206e-06, + "loss": 0.8059, + "step": 7330 + }, + { + "epoch": 0.4034894600693489, + "grad_norm": 0.6773823499679565, + "learning_rate": 9.049310217387432e-06, + "loss": 0.6848, + "step": 7331 + }, + { + "epoch": 0.40354449887170457, + "grad_norm": 0.6884829998016357, + "learning_rate": 9.049055921819275e-06, + "loss": 0.696, + "step": 7332 + }, + { + "epoch": 0.4035995376740602, + "grad_norm": 0.662545919418335, + "learning_rate": 9.048801595819494e-06, + "loss": 0.8286, + "step": 7333 + }, + { + "epoch": 0.4036545764764159, + "grad_norm": 0.6863077878952026, + "learning_rate": 9.048547239390007e-06, + "loss": 0.7215, + "step": 7334 + }, + { + "epoch": 0.4037096152787715, + "grad_norm": 0.6982632875442505, + "learning_rate": 9.048292852532721e-06, + "loss": 0.7635, + "step": 7335 + }, + { + "epoch": 0.4037646540811272, + "grad_norm": 0.8512400984764099, + "learning_rate": 9.048038435249548e-06, + "loss": 0.6226, + "step": 7336 + }, + { + "epoch": 0.40381969288348285, + "grad_norm": 0.6952843070030212, + "learning_rate": 9.047783987542405e-06, + "loss": 0.8317, + "step": 7337 + }, + { + "epoch": 0.40387473168583854, + "grad_norm": 0.7802778482437134, + "learning_rate": 9.0475295094132e-06, + "loss": 0.8615, + "step": 7338 + }, + { + "epoch": 0.4039297704881942, + "grad_norm": 0.8783930540084839, + "learning_rate": 9.047275000863844e-06, + "loss": 0.743, + "step": 7339 + }, + { + "epoch": 0.40398480929054986, + "grad_norm": 0.7205806970596313, + "learning_rate": 9.047020461896256e-06, + "loss": 0.7953, + "step": 7340 + }, + { + "epoch": 0.4040398480929055, + "grad_norm": 0.8438451290130615, + "learning_rate": 9.046765892512344e-06, + "loss": 0.7613, + "step": 7341 + }, + { + "epoch": 0.4040948868952612, + "grad_norm": 0.7300973534584045, + "learning_rate": 9.046511292714021e-06, + "loss": 0.7856, + "step": 7342 + }, + { + "epoch": 0.4041499256976168, + "grad_norm": 0.8472041487693787, + "learning_rate": 9.046256662503206e-06, + "loss": 0.8526, + "step": 7343 + }, + { + "epoch": 0.4042049644999725, + "grad_norm": 0.789465606212616, + "learning_rate": 9.046002001881807e-06, + "loss": 0.7792, + "step": 7344 + }, + { + "epoch": 0.40426000330232814, + "grad_norm": 0.7720938920974731, + "learning_rate": 9.04574731085174e-06, + "loss": 0.8065, + "step": 7345 + }, + { + "epoch": 0.4043150421046838, + "grad_norm": 0.6968526840209961, + "learning_rate": 9.04549258941492e-06, + "loss": 0.8135, + "step": 7346 + }, + { + "epoch": 0.40437008090703946, + "grad_norm": 0.746865451335907, + "learning_rate": 9.04523783757326e-06, + "loss": 0.8216, + "step": 7347 + }, + { + "epoch": 0.4044251197093951, + "grad_norm": 0.6750560998916626, + "learning_rate": 9.044983055328676e-06, + "loss": 0.7883, + "step": 7348 + }, + { + "epoch": 0.4044801585117508, + "grad_norm": 0.6791195273399353, + "learning_rate": 9.044728242683081e-06, + "loss": 0.7721, + "step": 7349 + }, + { + "epoch": 0.4045351973141064, + "grad_norm": 0.7238358855247498, + "learning_rate": 9.044473399638392e-06, + "loss": 0.739, + "step": 7350 + }, + { + "epoch": 0.4045902361164621, + "grad_norm": 0.6793557405471802, + "learning_rate": 9.044218526196523e-06, + "loss": 0.7853, + "step": 7351 + }, + { + "epoch": 0.40464527491881774, + "grad_norm": 0.767564058303833, + "learning_rate": 9.043963622359392e-06, + "loss": 0.8158, + "step": 7352 + }, + { + "epoch": 0.40470031372117343, + "grad_norm": 0.6800708770751953, + "learning_rate": 9.043708688128909e-06, + "loss": 0.7493, + "step": 7353 + }, + { + "epoch": 0.40475535252352907, + "grad_norm": 0.75978022813797, + "learning_rate": 9.043453723506996e-06, + "loss": 0.7066, + "step": 7354 + }, + { + "epoch": 0.40481039132588476, + "grad_norm": 1.0194984674453735, + "learning_rate": 9.043198728495568e-06, + "loss": 0.6238, + "step": 7355 + }, + { + "epoch": 0.4048654301282404, + "grad_norm": 0.7102386355400085, + "learning_rate": 9.04294370309654e-06, + "loss": 0.75, + "step": 7356 + }, + { + "epoch": 0.4049204689305961, + "grad_norm": 0.8468191623687744, + "learning_rate": 9.04268864731183e-06, + "loss": 0.8095, + "step": 7357 + }, + { + "epoch": 0.4049755077329517, + "grad_norm": 0.7022871971130371, + "learning_rate": 9.042433561143353e-06, + "loss": 0.8394, + "step": 7358 + }, + { + "epoch": 0.4050305465353074, + "grad_norm": 1.1873482465744019, + "learning_rate": 9.042178444593028e-06, + "loss": 0.7863, + "step": 7359 + }, + { + "epoch": 0.40508558533766303, + "grad_norm": 0.7074940204620361, + "learning_rate": 9.041923297662772e-06, + "loss": 0.7067, + "step": 7360 + }, + { + "epoch": 0.4051406241400187, + "grad_norm": 0.7602211833000183, + "learning_rate": 9.041668120354503e-06, + "loss": 0.6594, + "step": 7361 + }, + { + "epoch": 0.40519566294237436, + "grad_norm": 0.7903324365615845, + "learning_rate": 9.041412912670138e-06, + "loss": 0.7978, + "step": 7362 + }, + { + "epoch": 0.40525070174473005, + "grad_norm": 0.7422891855239868, + "learning_rate": 9.041157674611595e-06, + "loss": 0.8162, + "step": 7363 + }, + { + "epoch": 0.4053057405470857, + "grad_norm": 0.7978767156600952, + "learning_rate": 9.040902406180791e-06, + "loss": 0.762, + "step": 7364 + }, + { + "epoch": 0.40536077934944137, + "grad_norm": 0.7719776630401611, + "learning_rate": 9.04064710737965e-06, + "loss": 0.8098, + "step": 7365 + }, + { + "epoch": 0.405415818151797, + "grad_norm": 0.8646591305732727, + "learning_rate": 9.040391778210083e-06, + "loss": 0.9372, + "step": 7366 + }, + { + "epoch": 0.4054708569541527, + "grad_norm": 0.6616937518119812, + "learning_rate": 9.040136418674015e-06, + "loss": 0.7424, + "step": 7367 + }, + { + "epoch": 0.4055258957565083, + "grad_norm": 0.7676553130149841, + "learning_rate": 9.039881028773363e-06, + "loss": 0.6327, + "step": 7368 + }, + { + "epoch": 0.405580934558864, + "grad_norm": 0.6838239431381226, + "learning_rate": 9.039625608510047e-06, + "loss": 0.7548, + "step": 7369 + }, + { + "epoch": 0.40563597336121965, + "grad_norm": 0.7476304769515991, + "learning_rate": 9.039370157885986e-06, + "loss": 0.7262, + "step": 7370 + }, + { + "epoch": 0.40569101216357534, + "grad_norm": 0.8985139727592468, + "learning_rate": 9.0391146769031e-06, + "loss": 0.7729, + "step": 7371 + }, + { + "epoch": 0.40574605096593097, + "grad_norm": 0.7840422987937927, + "learning_rate": 9.038859165563308e-06, + "loss": 0.7855, + "step": 7372 + }, + { + "epoch": 0.40580108976828666, + "grad_norm": 0.6777672171592712, + "learning_rate": 9.038603623868534e-06, + "loss": 0.7379, + "step": 7373 + }, + { + "epoch": 0.4058561285706423, + "grad_norm": 0.7226746678352356, + "learning_rate": 9.038348051820694e-06, + "loss": 0.7686, + "step": 7374 + }, + { + "epoch": 0.405911167372998, + "grad_norm": 0.7647444605827332, + "learning_rate": 9.038092449421713e-06, + "loss": 0.8859, + "step": 7375 + }, + { + "epoch": 0.4059662061753536, + "grad_norm": 0.6524979472160339, + "learning_rate": 9.037836816673508e-06, + "loss": 0.6982, + "step": 7376 + }, + { + "epoch": 0.4060212449777093, + "grad_norm": 0.7842861413955688, + "learning_rate": 9.037581153578004e-06, + "loss": 0.8099, + "step": 7377 + }, + { + "epoch": 0.40607628378006494, + "grad_norm": 0.6424387693405151, + "learning_rate": 9.03732546013712e-06, + "loss": 0.7387, + "step": 7378 + }, + { + "epoch": 0.40613132258242063, + "grad_norm": 0.8444356918334961, + "learning_rate": 9.037069736352779e-06, + "loss": 0.8813, + "step": 7379 + }, + { + "epoch": 0.40618636138477626, + "grad_norm": 0.6487529277801514, + "learning_rate": 9.036813982226904e-06, + "loss": 0.7609, + "step": 7380 + }, + { + "epoch": 0.40624140018713195, + "grad_norm": 0.7891185879707336, + "learning_rate": 9.036558197761413e-06, + "loss": 0.8589, + "step": 7381 + }, + { + "epoch": 0.4062964389894876, + "grad_norm": 0.7183120250701904, + "learning_rate": 9.036302382958233e-06, + "loss": 0.8429, + "step": 7382 + }, + { + "epoch": 0.4063514777918433, + "grad_norm": 0.6386578679084778, + "learning_rate": 9.036046537819283e-06, + "loss": 0.6955, + "step": 7383 + }, + { + "epoch": 0.4064065165941989, + "grad_norm": 0.7572369575500488, + "learning_rate": 9.035790662346488e-06, + "loss": 0.8018, + "step": 7384 + }, + { + "epoch": 0.4064615553965546, + "grad_norm": 0.7105650305747986, + "learning_rate": 9.035534756541771e-06, + "loss": 0.8527, + "step": 7385 + }, + { + "epoch": 0.40651659419891023, + "grad_norm": 0.7031856179237366, + "learning_rate": 9.035278820407056e-06, + "loss": 0.6991, + "step": 7386 + }, + { + "epoch": 0.4065716330012659, + "grad_norm": 0.7407381534576416, + "learning_rate": 9.035022853944266e-06, + "loss": 0.708, + "step": 7387 + }, + { + "epoch": 0.40662667180362155, + "grad_norm": 0.7078498601913452, + "learning_rate": 9.034766857155322e-06, + "loss": 0.7584, + "step": 7388 + }, + { + "epoch": 0.4066817106059772, + "grad_norm": 0.7643301486968994, + "learning_rate": 9.034510830042151e-06, + "loss": 0.7836, + "step": 7389 + }, + { + "epoch": 0.4067367494083329, + "grad_norm": 0.7165302038192749, + "learning_rate": 9.034254772606676e-06, + "loss": 0.7769, + "step": 7390 + }, + { + "epoch": 0.4067917882106885, + "grad_norm": 0.7442395091056824, + "learning_rate": 9.033998684850824e-06, + "loss": 0.7231, + "step": 7391 + }, + { + "epoch": 0.4068468270130442, + "grad_norm": 0.7425046563148499, + "learning_rate": 9.033742566776517e-06, + "loss": 0.7709, + "step": 7392 + }, + { + "epoch": 0.40690186581539983, + "grad_norm": 0.768419086933136, + "learning_rate": 9.03348641838568e-06, + "loss": 0.7768, + "step": 7393 + }, + { + "epoch": 0.4069569046177555, + "grad_norm": 0.6785634160041809, + "learning_rate": 9.03323023968024e-06, + "loss": 0.7468, + "step": 7394 + }, + { + "epoch": 0.40701194342011116, + "grad_norm": 0.7075444459915161, + "learning_rate": 9.03297403066212e-06, + "loss": 0.7757, + "step": 7395 + }, + { + "epoch": 0.40706698222246684, + "grad_norm": 0.7580223679542542, + "learning_rate": 9.032717791333247e-06, + "loss": 0.7311, + "step": 7396 + }, + { + "epoch": 0.4071220210248225, + "grad_norm": 0.8110041618347168, + "learning_rate": 9.032461521695546e-06, + "loss": 0.7923, + "step": 7397 + }, + { + "epoch": 0.40717705982717817, + "grad_norm": 0.7204881310462952, + "learning_rate": 9.032205221750945e-06, + "loss": 0.759, + "step": 7398 + }, + { + "epoch": 0.4072320986295338, + "grad_norm": 0.8392491340637207, + "learning_rate": 9.031948891501368e-06, + "loss": 0.8292, + "step": 7399 + }, + { + "epoch": 0.4072871374318895, + "grad_norm": 0.7134600281715393, + "learning_rate": 9.031692530948742e-06, + "loss": 0.7, + "step": 7400 + }, + { + "epoch": 0.4073421762342451, + "grad_norm": 0.6324336528778076, + "learning_rate": 9.031436140094995e-06, + "loss": 0.6964, + "step": 7401 + }, + { + "epoch": 0.4073972150366008, + "grad_norm": 0.7281947731971741, + "learning_rate": 9.031179718942052e-06, + "loss": 0.7567, + "step": 7402 + }, + { + "epoch": 0.40745225383895645, + "grad_norm": 0.8828619718551636, + "learning_rate": 9.030923267491842e-06, + "loss": 0.8139, + "step": 7403 + }, + { + "epoch": 0.40750729264131214, + "grad_norm": 0.7039986252784729, + "learning_rate": 9.030666785746292e-06, + "loss": 0.7339, + "step": 7404 + }, + { + "epoch": 0.40756233144366777, + "grad_norm": 0.7049984931945801, + "learning_rate": 9.030410273707331e-06, + "loss": 0.6842, + "step": 7405 + }, + { + "epoch": 0.40761737024602346, + "grad_norm": 0.7149737477302551, + "learning_rate": 9.030153731376883e-06, + "loss": 0.6837, + "step": 7406 + }, + { + "epoch": 0.4076724090483791, + "grad_norm": 1.0804089307785034, + "learning_rate": 9.029897158756878e-06, + "loss": 0.7726, + "step": 7407 + }, + { + "epoch": 0.4077274478507348, + "grad_norm": 0.8354909420013428, + "learning_rate": 9.029640555849244e-06, + "loss": 0.8058, + "step": 7408 + }, + { + "epoch": 0.4077824866530904, + "grad_norm": 0.7091527581214905, + "learning_rate": 9.029383922655914e-06, + "loss": 0.7636, + "step": 7409 + }, + { + "epoch": 0.4078375254554461, + "grad_norm": 0.6720988750457764, + "learning_rate": 9.029127259178809e-06, + "loss": 0.7179, + "step": 7410 + }, + { + "epoch": 0.40789256425780174, + "grad_norm": 0.685858964920044, + "learning_rate": 9.028870565419865e-06, + "loss": 0.7637, + "step": 7411 + }, + { + "epoch": 0.4079476030601574, + "grad_norm": 0.7505033016204834, + "learning_rate": 9.028613841381007e-06, + "loss": 0.7463, + "step": 7412 + }, + { + "epoch": 0.40800264186251306, + "grad_norm": 0.8801671862602234, + "learning_rate": 9.028357087064166e-06, + "loss": 0.8399, + "step": 7413 + }, + { + "epoch": 0.40805768066486875, + "grad_norm": 0.7441918849945068, + "learning_rate": 9.02810030247127e-06, + "loss": 0.7689, + "step": 7414 + }, + { + "epoch": 0.4081127194672244, + "grad_norm": 0.7410128712654114, + "learning_rate": 9.027843487604251e-06, + "loss": 0.8013, + "step": 7415 + }, + { + "epoch": 0.40816775826958007, + "grad_norm": 0.8075226545333862, + "learning_rate": 9.02758664246504e-06, + "loss": 0.7717, + "step": 7416 + }, + { + "epoch": 0.4082227970719357, + "grad_norm": 0.7985545992851257, + "learning_rate": 9.027329767055566e-06, + "loss": 0.8459, + "step": 7417 + }, + { + "epoch": 0.4082778358742914, + "grad_norm": 0.7887235283851624, + "learning_rate": 9.027072861377757e-06, + "loss": 0.8201, + "step": 7418 + }, + { + "epoch": 0.40833287467664703, + "grad_norm": 0.7876266241073608, + "learning_rate": 9.02681592543355e-06, + "loss": 0.8205, + "step": 7419 + }, + { + "epoch": 0.4083879134790027, + "grad_norm": 0.758168637752533, + "learning_rate": 9.02655895922487e-06, + "loss": 0.6619, + "step": 7420 + }, + { + "epoch": 0.40844295228135835, + "grad_norm": 0.7279811501502991, + "learning_rate": 9.02630196275365e-06, + "loss": 0.7634, + "step": 7421 + }, + { + "epoch": 0.40849799108371404, + "grad_norm": 0.7540523409843445, + "learning_rate": 9.026044936021822e-06, + "loss": 0.7819, + "step": 7422 + }, + { + "epoch": 0.4085530298860697, + "grad_norm": 0.8091018795967102, + "learning_rate": 9.02578787903132e-06, + "loss": 0.7749, + "step": 7423 + }, + { + "epoch": 0.40860806868842536, + "grad_norm": 0.7625396847724915, + "learning_rate": 9.025530791784074e-06, + "loss": 0.7635, + "step": 7424 + }, + { + "epoch": 0.408663107490781, + "grad_norm": 0.7663947939872742, + "learning_rate": 9.025273674282015e-06, + "loss": 0.8281, + "step": 7425 + }, + { + "epoch": 0.4087181462931367, + "grad_norm": 0.6672662496566772, + "learning_rate": 9.025016526527077e-06, + "loss": 0.641, + "step": 7426 + }, + { + "epoch": 0.4087731850954923, + "grad_norm": 0.7649143934249878, + "learning_rate": 9.024759348521193e-06, + "loss": 0.7462, + "step": 7427 + }, + { + "epoch": 0.408828223897848, + "grad_norm": 0.7540067434310913, + "learning_rate": 9.024502140266293e-06, + "loss": 0.8756, + "step": 7428 + }, + { + "epoch": 0.40888326270020364, + "grad_norm": 0.721615731716156, + "learning_rate": 9.024244901764314e-06, + "loss": 0.8507, + "step": 7429 + }, + { + "epoch": 0.40893830150255933, + "grad_norm": 0.6949496269226074, + "learning_rate": 9.023987633017186e-06, + "loss": 0.7021, + "step": 7430 + }, + { + "epoch": 0.40899334030491497, + "grad_norm": 0.7108990550041199, + "learning_rate": 9.023730334026845e-06, + "loss": 0.807, + "step": 7431 + }, + { + "epoch": 0.4090483791072706, + "grad_norm": 0.7606124877929688, + "learning_rate": 9.023473004795225e-06, + "loss": 0.7769, + "step": 7432 + }, + { + "epoch": 0.4091034179096263, + "grad_norm": 0.7792031764984131, + "learning_rate": 9.023215645324256e-06, + "loss": 0.728, + "step": 7433 + }, + { + "epoch": 0.4091584567119819, + "grad_norm": 0.728884756565094, + "learning_rate": 9.022958255615877e-06, + "loss": 0.7831, + "step": 7434 + }, + { + "epoch": 0.4092134955143376, + "grad_norm": 0.8196625709533691, + "learning_rate": 9.022700835672022e-06, + "loss": 0.8265, + "step": 7435 + }, + { + "epoch": 0.40926853431669324, + "grad_norm": 0.762734055519104, + "learning_rate": 9.022443385494621e-06, + "loss": 0.8028, + "step": 7436 + }, + { + "epoch": 0.40932357311904893, + "grad_norm": 0.7259558439254761, + "learning_rate": 9.022185905085614e-06, + "loss": 0.789, + "step": 7437 + }, + { + "epoch": 0.40937861192140457, + "grad_norm": 0.7402371764183044, + "learning_rate": 9.021928394446936e-06, + "loss": 0.7667, + "step": 7438 + }, + { + "epoch": 0.40943365072376026, + "grad_norm": 0.8399797677993774, + "learning_rate": 9.021670853580519e-06, + "loss": 0.8451, + "step": 7439 + }, + { + "epoch": 0.4094886895261159, + "grad_norm": 0.6439585089683533, + "learning_rate": 9.0214132824883e-06, + "loss": 0.776, + "step": 7440 + }, + { + "epoch": 0.4095437283284716, + "grad_norm": 0.6956612467765808, + "learning_rate": 9.021155681172215e-06, + "loss": 0.6921, + "step": 7441 + }, + { + "epoch": 0.4095987671308272, + "grad_norm": 0.855413556098938, + "learning_rate": 9.020898049634203e-06, + "loss": 0.8552, + "step": 7442 + }, + { + "epoch": 0.4096538059331829, + "grad_norm": 0.6690535545349121, + "learning_rate": 9.020640387876194e-06, + "loss": 0.7552, + "step": 7443 + }, + { + "epoch": 0.40970884473553854, + "grad_norm": 0.6615462899208069, + "learning_rate": 9.020382695900131e-06, + "loss": 0.8216, + "step": 7444 + }, + { + "epoch": 0.4097638835378942, + "grad_norm": 0.6975858211517334, + "learning_rate": 9.020124973707947e-06, + "loss": 0.7453, + "step": 7445 + }, + { + "epoch": 0.40981892234024986, + "grad_norm": 0.6461964249610901, + "learning_rate": 9.019867221301579e-06, + "loss": 0.656, + "step": 7446 + }, + { + "epoch": 0.40987396114260555, + "grad_norm": 0.7221645712852478, + "learning_rate": 9.019609438682967e-06, + "loss": 0.661, + "step": 7447 + }, + { + "epoch": 0.4099289999449612, + "grad_norm": 0.6785755753517151, + "learning_rate": 9.019351625854044e-06, + "loss": 0.7294, + "step": 7448 + }, + { + "epoch": 0.40998403874731687, + "grad_norm": 0.7040538787841797, + "learning_rate": 9.019093782816751e-06, + "loss": 0.8546, + "step": 7449 + }, + { + "epoch": 0.4100390775496725, + "grad_norm": 0.737922191619873, + "learning_rate": 9.018835909573025e-06, + "loss": 0.8144, + "step": 7450 + }, + { + "epoch": 0.4100941163520282, + "grad_norm": 0.6705496311187744, + "learning_rate": 9.018578006124802e-06, + "loss": 0.6937, + "step": 7451 + }, + { + "epoch": 0.4101491551543838, + "grad_norm": 0.7347431182861328, + "learning_rate": 9.018320072474026e-06, + "loss": 0.7716, + "step": 7452 + }, + { + "epoch": 0.4102041939567395, + "grad_norm": 0.7023493647575378, + "learning_rate": 9.018062108622631e-06, + "loss": 0.7295, + "step": 7453 + }, + { + "epoch": 0.41025923275909515, + "grad_norm": 0.8017870187759399, + "learning_rate": 9.017804114572556e-06, + "loss": 0.7471, + "step": 7454 + }, + { + "epoch": 0.41031427156145084, + "grad_norm": 0.9171211123466492, + "learning_rate": 9.01754609032574e-06, + "loss": 0.8262, + "step": 7455 + }, + { + "epoch": 0.41036931036380647, + "grad_norm": 0.6682952046394348, + "learning_rate": 9.017288035884124e-06, + "loss": 0.7165, + "step": 7456 + }, + { + "epoch": 0.41042434916616216, + "grad_norm": 0.9339122772216797, + "learning_rate": 9.017029951249648e-06, + "loss": 0.8618, + "step": 7457 + }, + { + "epoch": 0.4104793879685178, + "grad_norm": 0.7063136696815491, + "learning_rate": 9.016771836424248e-06, + "loss": 0.8068, + "step": 7458 + }, + { + "epoch": 0.4105344267708735, + "grad_norm": 0.6717063784599304, + "learning_rate": 9.016513691409867e-06, + "loss": 0.738, + "step": 7459 + }, + { + "epoch": 0.4105894655732291, + "grad_norm": 0.6807749271392822, + "learning_rate": 9.016255516208443e-06, + "loss": 0.7842, + "step": 7460 + }, + { + "epoch": 0.4106445043755848, + "grad_norm": 0.6990453600883484, + "learning_rate": 9.01599731082192e-06, + "loss": 0.7726, + "step": 7461 + }, + { + "epoch": 0.41069954317794044, + "grad_norm": 0.6704931259155273, + "learning_rate": 9.015739075252234e-06, + "loss": 0.7006, + "step": 7462 + }, + { + "epoch": 0.41075458198029613, + "grad_norm": 0.7162300944328308, + "learning_rate": 9.01548080950133e-06, + "loss": 0.8462, + "step": 7463 + }, + { + "epoch": 0.41080962078265176, + "grad_norm": 0.6845411658287048, + "learning_rate": 9.015222513571144e-06, + "loss": 0.7466, + "step": 7464 + }, + { + "epoch": 0.41086465958500745, + "grad_norm": 0.7146134376525879, + "learning_rate": 9.014964187463623e-06, + "loss": 0.7594, + "step": 7465 + }, + { + "epoch": 0.4109196983873631, + "grad_norm": 0.7664906978607178, + "learning_rate": 9.014705831180706e-06, + "loss": 0.8376, + "step": 7466 + }, + { + "epoch": 0.4109747371897188, + "grad_norm": 0.7319341897964478, + "learning_rate": 9.014447444724332e-06, + "loss": 0.7748, + "step": 7467 + }, + { + "epoch": 0.4110297759920744, + "grad_norm": 0.7269605398178101, + "learning_rate": 9.014189028096448e-06, + "loss": 0.6941, + "step": 7468 + }, + { + "epoch": 0.4110848147944301, + "grad_norm": 0.72607421875, + "learning_rate": 9.013930581298993e-06, + "loss": 0.7174, + "step": 7469 + }, + { + "epoch": 0.41113985359678573, + "grad_norm": 0.7385421991348267, + "learning_rate": 9.01367210433391e-06, + "loss": 0.7761, + "step": 7470 + }, + { + "epoch": 0.4111948923991414, + "grad_norm": 0.8392042517662048, + "learning_rate": 9.013413597203144e-06, + "loss": 0.7417, + "step": 7471 + }, + { + "epoch": 0.41124993120149705, + "grad_norm": 0.7454584836959839, + "learning_rate": 9.013155059908634e-06, + "loss": 0.8976, + "step": 7472 + }, + { + "epoch": 0.41130497000385274, + "grad_norm": 0.7358037829399109, + "learning_rate": 9.012896492452325e-06, + "loss": 0.7706, + "step": 7473 + }, + { + "epoch": 0.4113600088062084, + "grad_norm": 0.7454121708869934, + "learning_rate": 9.01263789483616e-06, + "loss": 0.7425, + "step": 7474 + }, + { + "epoch": 0.411415047608564, + "grad_norm": 0.7842294573783875, + "learning_rate": 9.012379267062081e-06, + "loss": 0.7739, + "step": 7475 + }, + { + "epoch": 0.4114700864109197, + "grad_norm": 0.7181714773178101, + "learning_rate": 9.012120609132036e-06, + "loss": 0.8466, + "step": 7476 + }, + { + "epoch": 0.41152512521327533, + "grad_norm": 0.7239206433296204, + "learning_rate": 9.011861921047966e-06, + "loss": 0.7493, + "step": 7477 + }, + { + "epoch": 0.411580164015631, + "grad_norm": 0.6773414611816406, + "learning_rate": 9.011603202811816e-06, + "loss": 0.7433, + "step": 7478 + }, + { + "epoch": 0.41163520281798666, + "grad_norm": 0.7770900130271912, + "learning_rate": 9.011344454425527e-06, + "loss": 0.7488, + "step": 7479 + }, + { + "epoch": 0.41169024162034235, + "grad_norm": 0.7305957674980164, + "learning_rate": 9.011085675891051e-06, + "loss": 0.7989, + "step": 7480 + }, + { + "epoch": 0.411745280422698, + "grad_norm": 0.734603762626648, + "learning_rate": 9.010826867210327e-06, + "loss": 0.805, + "step": 7481 + }, + { + "epoch": 0.41180031922505367, + "grad_norm": 0.7438979148864746, + "learning_rate": 9.010568028385303e-06, + "loss": 0.8407, + "step": 7482 + }, + { + "epoch": 0.4118553580274093, + "grad_norm": 0.6718543767929077, + "learning_rate": 9.01030915941792e-06, + "loss": 0.7575, + "step": 7483 + }, + { + "epoch": 0.411910396829765, + "grad_norm": 0.8157614469528198, + "learning_rate": 9.01005026031013e-06, + "loss": 0.8231, + "step": 7484 + }, + { + "epoch": 0.4119654356321206, + "grad_norm": 0.8927714824676514, + "learning_rate": 9.009791331063874e-06, + "loss": 0.808, + "step": 7485 + }, + { + "epoch": 0.4120204744344763, + "grad_norm": 0.7604075074195862, + "learning_rate": 9.009532371681101e-06, + "loss": 0.7505, + "step": 7486 + }, + { + "epoch": 0.41207551323683195, + "grad_norm": 0.6861944794654846, + "learning_rate": 9.009273382163754e-06, + "loss": 0.719, + "step": 7487 + }, + { + "epoch": 0.41213055203918764, + "grad_norm": 0.7043709754943848, + "learning_rate": 9.009014362513784e-06, + "loss": 0.8193, + "step": 7488 + }, + { + "epoch": 0.41218559084154327, + "grad_norm": 0.7459648847579956, + "learning_rate": 9.008755312733136e-06, + "loss": 0.8617, + "step": 7489 + }, + { + "epoch": 0.41224062964389896, + "grad_norm": 0.7272594571113586, + "learning_rate": 9.008496232823754e-06, + "loss": 0.7255, + "step": 7490 + }, + { + "epoch": 0.4122956684462546, + "grad_norm": 0.7486668229103088, + "learning_rate": 9.008237122787586e-06, + "loss": 0.6479, + "step": 7491 + }, + { + "epoch": 0.4123507072486103, + "grad_norm": 0.8149027228355408, + "learning_rate": 9.007977982626582e-06, + "loss": 0.8052, + "step": 7492 + }, + { + "epoch": 0.4124057460509659, + "grad_norm": 0.7054859399795532, + "learning_rate": 9.00771881234269e-06, + "loss": 0.8215, + "step": 7493 + }, + { + "epoch": 0.4124607848533216, + "grad_norm": 0.6840499639511108, + "learning_rate": 9.007459611937854e-06, + "loss": 0.776, + "step": 7494 + }, + { + "epoch": 0.41251582365567724, + "grad_norm": 0.7340932488441467, + "learning_rate": 9.007200381414026e-06, + "loss": 0.713, + "step": 7495 + }, + { + "epoch": 0.4125708624580329, + "grad_norm": 0.8282599449157715, + "learning_rate": 9.00694112077315e-06, + "loss": 0.7037, + "step": 7496 + }, + { + "epoch": 0.41262590126038856, + "grad_norm": 0.849588930606842, + "learning_rate": 9.00668183001718e-06, + "loss": 0.7845, + "step": 7497 + }, + { + "epoch": 0.41268094006274425, + "grad_norm": 0.8330783843994141, + "learning_rate": 9.00642250914806e-06, + "loss": 0.9049, + "step": 7498 + }, + { + "epoch": 0.4127359788650999, + "grad_norm": 0.7020101547241211, + "learning_rate": 9.00616315816774e-06, + "loss": 0.8146, + "step": 7499 + }, + { + "epoch": 0.4127910176674556, + "grad_norm": 0.7632037997245789, + "learning_rate": 9.005903777078173e-06, + "loss": 0.6629, + "step": 7500 + }, + { + "epoch": 0.4128460564698112, + "grad_norm": 0.7286840081214905, + "learning_rate": 9.005644365881304e-06, + "loss": 0.7795, + "step": 7501 + }, + { + "epoch": 0.4129010952721669, + "grad_norm": 0.710451066493988, + "learning_rate": 9.005384924579084e-06, + "loss": 0.7615, + "step": 7502 + }, + { + "epoch": 0.41295613407452253, + "grad_norm": 0.7657510042190552, + "learning_rate": 9.005125453173463e-06, + "loss": 0.8938, + "step": 7503 + }, + { + "epoch": 0.4130111728768782, + "grad_norm": 0.6978467702865601, + "learning_rate": 9.004865951666392e-06, + "loss": 0.7464, + "step": 7504 + }, + { + "epoch": 0.41306621167923385, + "grad_norm": 0.7028319835662842, + "learning_rate": 9.00460642005982e-06, + "loss": 0.7899, + "step": 7505 + }, + { + "epoch": 0.41312125048158954, + "grad_norm": 0.923951268196106, + "learning_rate": 9.004346858355698e-06, + "loss": 0.8851, + "step": 7506 + }, + { + "epoch": 0.4131762892839452, + "grad_norm": 0.7293704748153687, + "learning_rate": 9.004087266555978e-06, + "loss": 0.7594, + "step": 7507 + }, + { + "epoch": 0.41323132808630086, + "grad_norm": 0.7458868622779846, + "learning_rate": 9.003827644662608e-06, + "loss": 0.7538, + "step": 7508 + }, + { + "epoch": 0.4132863668886565, + "grad_norm": 0.6764113306999207, + "learning_rate": 9.003567992677543e-06, + "loss": 0.7303, + "step": 7509 + }, + { + "epoch": 0.4133414056910122, + "grad_norm": 0.7827350497245789, + "learning_rate": 9.003308310602732e-06, + "loss": 0.7708, + "step": 7510 + }, + { + "epoch": 0.4133964444933678, + "grad_norm": 0.7683281302452087, + "learning_rate": 9.003048598440127e-06, + "loss": 0.7971, + "step": 7511 + }, + { + "epoch": 0.4134514832957235, + "grad_norm": 0.8793813586235046, + "learning_rate": 9.002788856191679e-06, + "loss": 0.7434, + "step": 7512 + }, + { + "epoch": 0.41350652209807914, + "grad_norm": 0.6598063111305237, + "learning_rate": 9.002529083859343e-06, + "loss": 0.7082, + "step": 7513 + }, + { + "epoch": 0.41356156090043483, + "grad_norm": 0.8239839673042297, + "learning_rate": 9.002269281445071e-06, + "loss": 0.8457, + "step": 7514 + }, + { + "epoch": 0.41361659970279047, + "grad_norm": 0.7433123588562012, + "learning_rate": 9.002009448950812e-06, + "loss": 0.7399, + "step": 7515 + }, + { + "epoch": 0.41367163850514616, + "grad_norm": 0.8310487866401672, + "learning_rate": 9.001749586378524e-06, + "loss": 0.7482, + "step": 7516 + }, + { + "epoch": 0.4137266773075018, + "grad_norm": 0.7170824408531189, + "learning_rate": 9.001489693730155e-06, + "loss": 0.7856, + "step": 7517 + }, + { + "epoch": 0.4137817161098574, + "grad_norm": 0.9063520431518555, + "learning_rate": 9.00122977100766e-06, + "loss": 0.8623, + "step": 7518 + }, + { + "epoch": 0.4138367549122131, + "grad_norm": 0.8753733038902283, + "learning_rate": 9.000969818212996e-06, + "loss": 0.7875, + "step": 7519 + }, + { + "epoch": 0.41389179371456875, + "grad_norm": 0.7013519406318665, + "learning_rate": 9.000709835348112e-06, + "loss": 0.724, + "step": 7520 + }, + { + "epoch": 0.41394683251692443, + "grad_norm": 0.7385973334312439, + "learning_rate": 9.000449822414963e-06, + "loss": 0.7286, + "step": 7521 + }, + { + "epoch": 0.41400187131928007, + "grad_norm": 0.7605431079864502, + "learning_rate": 9.000189779415505e-06, + "loss": 0.728, + "step": 7522 + }, + { + "epoch": 0.41405691012163576, + "grad_norm": 0.7631710767745972, + "learning_rate": 8.99992970635169e-06, + "loss": 0.8276, + "step": 7523 + }, + { + "epoch": 0.4141119489239914, + "grad_norm": 0.8066657185554504, + "learning_rate": 8.999669603225477e-06, + "loss": 0.8319, + "step": 7524 + }, + { + "epoch": 0.4141669877263471, + "grad_norm": 0.689407229423523, + "learning_rate": 8.999409470038815e-06, + "loss": 0.6675, + "step": 7525 + }, + { + "epoch": 0.4142220265287027, + "grad_norm": 0.7391255497932434, + "learning_rate": 8.999149306793664e-06, + "loss": 0.8228, + "step": 7526 + }, + { + "epoch": 0.4142770653310584, + "grad_norm": 0.7208844423294067, + "learning_rate": 8.998889113491977e-06, + "loss": 0.7689, + "step": 7527 + }, + { + "epoch": 0.41433210413341404, + "grad_norm": 0.8278803825378418, + "learning_rate": 8.99862889013571e-06, + "loss": 0.7964, + "step": 7528 + }, + { + "epoch": 0.4143871429357697, + "grad_norm": 0.7287253141403198, + "learning_rate": 8.998368636726817e-06, + "loss": 0.7689, + "step": 7529 + }, + { + "epoch": 0.41444218173812536, + "grad_norm": 0.7159145474433899, + "learning_rate": 8.998108353267257e-06, + "loss": 0.7537, + "step": 7530 + }, + { + "epoch": 0.41449722054048105, + "grad_norm": 0.7605739235877991, + "learning_rate": 8.997848039758985e-06, + "loss": 0.7327, + "step": 7531 + }, + { + "epoch": 0.4145522593428367, + "grad_norm": 0.7290406227111816, + "learning_rate": 8.997587696203958e-06, + "loss": 0.6804, + "step": 7532 + }, + { + "epoch": 0.41460729814519237, + "grad_norm": 0.7613189816474915, + "learning_rate": 8.997327322604131e-06, + "loss": 0.7465, + "step": 7533 + }, + { + "epoch": 0.414662336947548, + "grad_norm": 0.7796703577041626, + "learning_rate": 8.99706691896146e-06, + "loss": 0.7444, + "step": 7534 + }, + { + "epoch": 0.4147173757499037, + "grad_norm": 0.8758549094200134, + "learning_rate": 8.996806485277904e-06, + "loss": 0.8586, + "step": 7535 + }, + { + "epoch": 0.4147724145522593, + "grad_norm": 0.9599420428276062, + "learning_rate": 8.996546021555423e-06, + "loss": 0.7554, + "step": 7536 + }, + { + "epoch": 0.414827453354615, + "grad_norm": 0.8216326236724854, + "learning_rate": 8.996285527795972e-06, + "loss": 0.7995, + "step": 7537 + }, + { + "epoch": 0.41488249215697065, + "grad_norm": 0.6777452230453491, + "learning_rate": 8.996025004001507e-06, + "loss": 0.7809, + "step": 7538 + }, + { + "epoch": 0.41493753095932634, + "grad_norm": 0.7354100942611694, + "learning_rate": 8.995764450173989e-06, + "loss": 0.6548, + "step": 7539 + }, + { + "epoch": 0.414992569761682, + "grad_norm": 0.7548280358314514, + "learning_rate": 8.995503866315373e-06, + "loss": 0.8308, + "step": 7540 + }, + { + "epoch": 0.41504760856403766, + "grad_norm": 0.6891447901725769, + "learning_rate": 8.995243252427622e-06, + "loss": 0.8386, + "step": 7541 + }, + { + "epoch": 0.4151026473663933, + "grad_norm": 0.6848340034484863, + "learning_rate": 8.99498260851269e-06, + "loss": 0.7587, + "step": 7542 + }, + { + "epoch": 0.415157686168749, + "grad_norm": 0.7109090685844421, + "learning_rate": 8.994721934572538e-06, + "loss": 0.6847, + "step": 7543 + }, + { + "epoch": 0.4152127249711046, + "grad_norm": 0.6708144545555115, + "learning_rate": 8.994461230609128e-06, + "loss": 0.7266, + "step": 7544 + }, + { + "epoch": 0.4152677637734603, + "grad_norm": 0.6985414028167725, + "learning_rate": 8.994200496624415e-06, + "loss": 0.7696, + "step": 7545 + }, + { + "epoch": 0.41532280257581594, + "grad_norm": 0.6989198923110962, + "learning_rate": 8.993939732620359e-06, + "loss": 0.7894, + "step": 7546 + }, + { + "epoch": 0.41537784137817163, + "grad_norm": 0.6667589545249939, + "learning_rate": 8.993678938598921e-06, + "loss": 0.7417, + "step": 7547 + }, + { + "epoch": 0.41543288018052726, + "grad_norm": 1.0692487955093384, + "learning_rate": 8.993418114562064e-06, + "loss": 0.7147, + "step": 7548 + }, + { + "epoch": 0.41548791898288295, + "grad_norm": 0.6709207892417908, + "learning_rate": 8.993157260511742e-06, + "loss": 0.7694, + "step": 7549 + }, + { + "epoch": 0.4155429577852386, + "grad_norm": 0.6714604496955872, + "learning_rate": 8.992896376449923e-06, + "loss": 0.6969, + "step": 7550 + }, + { + "epoch": 0.4155979965875943, + "grad_norm": 0.8266897201538086, + "learning_rate": 8.99263546237856e-06, + "loss": 0.8392, + "step": 7551 + }, + { + "epoch": 0.4156530353899499, + "grad_norm": 0.675188422203064, + "learning_rate": 8.992374518299619e-06, + "loss": 0.7525, + "step": 7552 + }, + { + "epoch": 0.4157080741923056, + "grad_norm": 0.7406265139579773, + "learning_rate": 8.992113544215059e-06, + "loss": 0.7895, + "step": 7553 + }, + { + "epoch": 0.41576311299466123, + "grad_norm": 0.837336003780365, + "learning_rate": 8.991852540126844e-06, + "loss": 0.7376, + "step": 7554 + }, + { + "epoch": 0.4158181517970169, + "grad_norm": 0.6774994730949402, + "learning_rate": 8.991591506036931e-06, + "loss": 0.7231, + "step": 7555 + }, + { + "epoch": 0.41587319059937256, + "grad_norm": 0.6941245794296265, + "learning_rate": 8.991330441947287e-06, + "loss": 0.7213, + "step": 7556 + }, + { + "epoch": 0.41592822940172824, + "grad_norm": 0.7588210105895996, + "learning_rate": 8.991069347859871e-06, + "loss": 0.7829, + "step": 7557 + }, + { + "epoch": 0.4159832682040839, + "grad_norm": 0.7580196857452393, + "learning_rate": 8.990808223776647e-06, + "loss": 0.7782, + "step": 7558 + }, + { + "epoch": 0.41603830700643957, + "grad_norm": 0.7597478032112122, + "learning_rate": 8.990547069699576e-06, + "loss": 0.7764, + "step": 7559 + }, + { + "epoch": 0.4160933458087952, + "grad_norm": 0.7950314283370972, + "learning_rate": 8.990285885630622e-06, + "loss": 0.7263, + "step": 7560 + }, + { + "epoch": 0.41614838461115083, + "grad_norm": 0.6962432265281677, + "learning_rate": 8.990024671571747e-06, + "loss": 0.6616, + "step": 7561 + }, + { + "epoch": 0.4162034234135065, + "grad_norm": 0.682816207408905, + "learning_rate": 8.989763427524915e-06, + "loss": 0.7862, + "step": 7562 + }, + { + "epoch": 0.41625846221586216, + "grad_norm": 0.686673104763031, + "learning_rate": 8.989502153492089e-06, + "loss": 0.8199, + "step": 7563 + }, + { + "epoch": 0.41631350101821785, + "grad_norm": 0.7954965233802795, + "learning_rate": 8.989240849475231e-06, + "loss": 0.8021, + "step": 7564 + }, + { + "epoch": 0.4163685398205735, + "grad_norm": 0.7516284584999084, + "learning_rate": 8.988979515476309e-06, + "loss": 0.7803, + "step": 7565 + }, + { + "epoch": 0.41642357862292917, + "grad_norm": 0.7148317694664001, + "learning_rate": 8.988718151497284e-06, + "loss": 0.7407, + "step": 7566 + }, + { + "epoch": 0.4164786174252848, + "grad_norm": 0.7898986339569092, + "learning_rate": 8.98845675754012e-06, + "loss": 0.8382, + "step": 7567 + }, + { + "epoch": 0.4165336562276405, + "grad_norm": 0.7014235854148865, + "learning_rate": 8.988195333606784e-06, + "loss": 0.7205, + "step": 7568 + }, + { + "epoch": 0.4165886950299961, + "grad_norm": 0.6520957350730896, + "learning_rate": 8.987933879699238e-06, + "loss": 0.7452, + "step": 7569 + }, + { + "epoch": 0.4166437338323518, + "grad_norm": 0.7462863922119141, + "learning_rate": 8.987672395819449e-06, + "loss": 0.7787, + "step": 7570 + }, + { + "epoch": 0.41669877263470745, + "grad_norm": 0.7366049885749817, + "learning_rate": 8.987410881969382e-06, + "loss": 0.7662, + "step": 7571 + }, + { + "epoch": 0.41675381143706314, + "grad_norm": 0.7732293009757996, + "learning_rate": 8.987149338151002e-06, + "loss": 0.8258, + "step": 7572 + }, + { + "epoch": 0.41680885023941877, + "grad_norm": 0.9309358596801758, + "learning_rate": 8.986887764366275e-06, + "loss": 0.6538, + "step": 7573 + }, + { + "epoch": 0.41686388904177446, + "grad_norm": 0.6976680755615234, + "learning_rate": 8.986626160617167e-06, + "loss": 0.7175, + "step": 7574 + }, + { + "epoch": 0.4169189278441301, + "grad_norm": 0.7541783452033997, + "learning_rate": 8.986364526905645e-06, + "loss": 0.8153, + "step": 7575 + }, + { + "epoch": 0.4169739666464858, + "grad_norm": 0.8968943357467651, + "learning_rate": 8.986102863233673e-06, + "loss": 0.7859, + "step": 7576 + }, + { + "epoch": 0.4170290054488414, + "grad_norm": 0.6910044550895691, + "learning_rate": 8.985841169603218e-06, + "loss": 0.8381, + "step": 7577 + }, + { + "epoch": 0.4170840442511971, + "grad_norm": 0.8944257497787476, + "learning_rate": 8.985579446016249e-06, + "loss": 0.7062, + "step": 7578 + }, + { + "epoch": 0.41713908305355274, + "grad_norm": 0.6665629744529724, + "learning_rate": 8.98531769247473e-06, + "loss": 0.7928, + "step": 7579 + }, + { + "epoch": 0.41719412185590843, + "grad_norm": 0.7642979621887207, + "learning_rate": 8.985055908980634e-06, + "loss": 0.8442, + "step": 7580 + }, + { + "epoch": 0.41724916065826406, + "grad_norm": 0.7575559020042419, + "learning_rate": 8.98479409553592e-06, + "loss": 0.795, + "step": 7581 + }, + { + "epoch": 0.41730419946061975, + "grad_norm": 0.6567206978797913, + "learning_rate": 8.984532252142563e-06, + "loss": 0.713, + "step": 7582 + }, + { + "epoch": 0.4173592382629754, + "grad_norm": 0.6677179336547852, + "learning_rate": 8.984270378802527e-06, + "loss": 0.8173, + "step": 7583 + }, + { + "epoch": 0.4174142770653311, + "grad_norm": 0.6846007704734802, + "learning_rate": 8.984008475517782e-06, + "loss": 0.7154, + "step": 7584 + }, + { + "epoch": 0.4174693158676867, + "grad_norm": 0.7758762836456299, + "learning_rate": 8.983746542290294e-06, + "loss": 0.8686, + "step": 7585 + }, + { + "epoch": 0.4175243546700424, + "grad_norm": 0.6850305199623108, + "learning_rate": 8.983484579122036e-06, + "loss": 0.7568, + "step": 7586 + }, + { + "epoch": 0.41757939347239803, + "grad_norm": 0.7165307998657227, + "learning_rate": 8.983222586014973e-06, + "loss": 0.7856, + "step": 7587 + }, + { + "epoch": 0.4176344322747537, + "grad_norm": 0.7747449278831482, + "learning_rate": 8.982960562971074e-06, + "loss": 0.8148, + "step": 7588 + }, + { + "epoch": 0.41768947107710935, + "grad_norm": 0.789235532283783, + "learning_rate": 8.982698509992311e-06, + "loss": 0.8021, + "step": 7589 + }, + { + "epoch": 0.41774450987946504, + "grad_norm": 0.664186954498291, + "learning_rate": 8.982436427080652e-06, + "loss": 0.7394, + "step": 7590 + }, + { + "epoch": 0.4177995486818207, + "grad_norm": 0.7045899033546448, + "learning_rate": 8.982174314238069e-06, + "loss": 0.7029, + "step": 7591 + }, + { + "epoch": 0.41785458748417637, + "grad_norm": 0.7569751739501953, + "learning_rate": 8.981912171466525e-06, + "loss": 0.6106, + "step": 7592 + }, + { + "epoch": 0.417909626286532, + "grad_norm": 0.7383938431739807, + "learning_rate": 8.981649998767998e-06, + "loss": 0.8163, + "step": 7593 + }, + { + "epoch": 0.4179646650888877, + "grad_norm": 0.7314342856407166, + "learning_rate": 8.981387796144456e-06, + "loss": 0.6847, + "step": 7594 + }, + { + "epoch": 0.4180197038912433, + "grad_norm": 0.7249840497970581, + "learning_rate": 8.981125563597867e-06, + "loss": 0.8025, + "step": 7595 + }, + { + "epoch": 0.418074742693599, + "grad_norm": 0.7260022759437561, + "learning_rate": 8.980863301130206e-06, + "loss": 0.7807, + "step": 7596 + }, + { + "epoch": 0.41812978149595464, + "grad_norm": 0.6249421834945679, + "learning_rate": 8.980601008743441e-06, + "loss": 0.6744, + "step": 7597 + }, + { + "epoch": 0.41818482029831033, + "grad_norm": 0.8132835626602173, + "learning_rate": 8.980338686439544e-06, + "loss": 0.7992, + "step": 7598 + }, + { + "epoch": 0.41823985910066597, + "grad_norm": 0.7279506921768188, + "learning_rate": 8.980076334220487e-06, + "loss": 0.8402, + "step": 7599 + }, + { + "epoch": 0.41829489790302166, + "grad_norm": 0.7168325781822205, + "learning_rate": 8.979813952088242e-06, + "loss": 0.9107, + "step": 7600 + }, + { + "epoch": 0.4183499367053773, + "grad_norm": 0.633661150932312, + "learning_rate": 8.97955154004478e-06, + "loss": 0.6328, + "step": 7601 + }, + { + "epoch": 0.418404975507733, + "grad_norm": 0.6770638227462769, + "learning_rate": 8.979289098092074e-06, + "loss": 0.7604, + "step": 7602 + }, + { + "epoch": 0.4184600143100886, + "grad_norm": 0.7589067816734314, + "learning_rate": 8.979026626232098e-06, + "loss": 0.7774, + "step": 7603 + }, + { + "epoch": 0.41851505311244425, + "grad_norm": 0.7116312980651855, + "learning_rate": 8.97876412446682e-06, + "loss": 0.8186, + "step": 7604 + }, + { + "epoch": 0.41857009191479994, + "grad_norm": 0.7369259595870972, + "learning_rate": 8.978501592798219e-06, + "loss": 0.6705, + "step": 7605 + }, + { + "epoch": 0.41862513071715557, + "grad_norm": 0.6201806664466858, + "learning_rate": 8.978239031228265e-06, + "loss": 0.7011, + "step": 7606 + }, + { + "epoch": 0.41868016951951126, + "grad_norm": 0.7652842998504639, + "learning_rate": 8.977976439758929e-06, + "loss": 0.8112, + "step": 7607 + }, + { + "epoch": 0.4187352083218669, + "grad_norm": 0.7214640974998474, + "learning_rate": 8.97771381839219e-06, + "loss": 0.767, + "step": 7608 + }, + { + "epoch": 0.4187902471242226, + "grad_norm": 0.8093706369400024, + "learning_rate": 8.977451167130015e-06, + "loss": 0.8112, + "step": 7609 + }, + { + "epoch": 0.4188452859265782, + "grad_norm": 0.7023005485534668, + "learning_rate": 8.977188485974382e-06, + "loss": 0.7678, + "step": 7610 + }, + { + "epoch": 0.4189003247289339, + "grad_norm": 0.8126183748245239, + "learning_rate": 8.976925774927267e-06, + "loss": 0.8207, + "step": 7611 + }, + { + "epoch": 0.41895536353128954, + "grad_norm": 0.9624595642089844, + "learning_rate": 8.976663033990643e-06, + "loss": 0.7853, + "step": 7612 + }, + { + "epoch": 0.4190104023336452, + "grad_norm": 0.7866421937942505, + "learning_rate": 8.976400263166483e-06, + "loss": 0.6319, + "step": 7613 + }, + { + "epoch": 0.41906544113600086, + "grad_norm": 0.7555810213088989, + "learning_rate": 8.976137462456762e-06, + "loss": 0.7781, + "step": 7614 + }, + { + "epoch": 0.41912047993835655, + "grad_norm": 0.7383303046226501, + "learning_rate": 8.975874631863457e-06, + "loss": 0.8152, + "step": 7615 + }, + { + "epoch": 0.4191755187407122, + "grad_norm": 0.7873355746269226, + "learning_rate": 8.975611771388542e-06, + "loss": 0.723, + "step": 7616 + }, + { + "epoch": 0.41923055754306787, + "grad_norm": 0.7265962362289429, + "learning_rate": 8.975348881033993e-06, + "loss": 0.8016, + "step": 7617 + }, + { + "epoch": 0.4192855963454235, + "grad_norm": 0.7074393033981323, + "learning_rate": 8.975085960801788e-06, + "loss": 0.7453, + "step": 7618 + }, + { + "epoch": 0.4193406351477792, + "grad_norm": 0.6975581049919128, + "learning_rate": 8.9748230106939e-06, + "loss": 0.6516, + "step": 7619 + }, + { + "epoch": 0.41939567395013483, + "grad_norm": 0.7730469107627869, + "learning_rate": 8.974560030712304e-06, + "loss": 0.7297, + "step": 7620 + }, + { + "epoch": 0.4194507127524905, + "grad_norm": 0.7289026379585266, + "learning_rate": 8.974297020858982e-06, + "loss": 0.7087, + "step": 7621 + }, + { + "epoch": 0.41950575155484615, + "grad_norm": 0.8029256463050842, + "learning_rate": 8.974033981135906e-06, + "loss": 0.7923, + "step": 7622 + }, + { + "epoch": 0.41956079035720184, + "grad_norm": 0.765312135219574, + "learning_rate": 8.973770911545055e-06, + "loss": 0.7824, + "step": 7623 + }, + { + "epoch": 0.4196158291595575, + "grad_norm": 0.7903861403465271, + "learning_rate": 8.973507812088404e-06, + "loss": 0.8207, + "step": 7624 + }, + { + "epoch": 0.41967086796191316, + "grad_norm": 0.6875497698783875, + "learning_rate": 8.973244682767934e-06, + "loss": 0.7972, + "step": 7625 + }, + { + "epoch": 0.4197259067642688, + "grad_norm": 0.7781878709793091, + "learning_rate": 8.972981523585617e-06, + "loss": 0.754, + "step": 7626 + }, + { + "epoch": 0.4197809455666245, + "grad_norm": 0.6495640873908997, + "learning_rate": 8.972718334543437e-06, + "loss": 0.6851, + "step": 7627 + }, + { + "epoch": 0.4198359843689801, + "grad_norm": 0.7610780596733093, + "learning_rate": 8.97245511564337e-06, + "loss": 0.8161, + "step": 7628 + }, + { + "epoch": 0.4198910231713358, + "grad_norm": 0.7764771580696106, + "learning_rate": 8.972191866887393e-06, + "loss": 0.8341, + "step": 7629 + }, + { + "epoch": 0.41994606197369144, + "grad_norm": 0.7709774374961853, + "learning_rate": 8.971928588277485e-06, + "loss": 0.765, + "step": 7630 + }, + { + "epoch": 0.42000110077604713, + "grad_norm": 0.8213009238243103, + "learning_rate": 8.971665279815625e-06, + "loss": 0.8971, + "step": 7631 + }, + { + "epoch": 0.42005613957840277, + "grad_norm": 0.7232406735420227, + "learning_rate": 8.971401941503792e-06, + "loss": 0.7919, + "step": 7632 + }, + { + "epoch": 0.42011117838075845, + "grad_norm": 0.7322028279304504, + "learning_rate": 8.971138573343964e-06, + "loss": 0.8167, + "step": 7633 + }, + { + "epoch": 0.4201662171831141, + "grad_norm": 0.7204442024230957, + "learning_rate": 8.970875175338123e-06, + "loss": 0.8152, + "step": 7634 + }, + { + "epoch": 0.4202212559854698, + "grad_norm": 0.7385342121124268, + "learning_rate": 8.970611747488246e-06, + "loss": 0.8204, + "step": 7635 + }, + { + "epoch": 0.4202762947878254, + "grad_norm": 0.758941113948822, + "learning_rate": 8.970348289796316e-06, + "loss": 0.8402, + "step": 7636 + }, + { + "epoch": 0.4203313335901811, + "grad_norm": 0.7331902384757996, + "learning_rate": 8.970084802264309e-06, + "loss": 0.7305, + "step": 7637 + }, + { + "epoch": 0.42038637239253673, + "grad_norm": 0.7822885513305664, + "learning_rate": 8.969821284894208e-06, + "loss": 0.8708, + "step": 7638 + }, + { + "epoch": 0.4204414111948924, + "grad_norm": 0.6625984311103821, + "learning_rate": 8.969557737687992e-06, + "loss": 0.7806, + "step": 7639 + }, + { + "epoch": 0.42049644999724806, + "grad_norm": 1.02848482131958, + "learning_rate": 8.969294160647645e-06, + "loss": 0.7176, + "step": 7640 + }, + { + "epoch": 0.42055148879960375, + "grad_norm": 0.7888724207878113, + "learning_rate": 8.969030553775144e-06, + "loss": 0.8326, + "step": 7641 + }, + { + "epoch": 0.4206065276019594, + "grad_norm": 0.7148883938789368, + "learning_rate": 8.968766917072472e-06, + "loss": 0.7405, + "step": 7642 + }, + { + "epoch": 0.42066156640431507, + "grad_norm": 0.6629698872566223, + "learning_rate": 8.96850325054161e-06, + "loss": 0.845, + "step": 7643 + }, + { + "epoch": 0.4207166052066707, + "grad_norm": 0.8414682149887085, + "learning_rate": 8.96823955418454e-06, + "loss": 1.3631, + "step": 7644 + }, + { + "epoch": 0.4207716440090264, + "grad_norm": 0.7105298638343811, + "learning_rate": 8.967975828003244e-06, + "loss": 0.6808, + "step": 7645 + }, + { + "epoch": 0.420826682811382, + "grad_norm": 0.7324852347373962, + "learning_rate": 8.967712071999703e-06, + "loss": 0.8237, + "step": 7646 + }, + { + "epoch": 0.42088172161373766, + "grad_norm": 0.737324595451355, + "learning_rate": 8.9674482861759e-06, + "loss": 0.8486, + "step": 7647 + }, + { + "epoch": 0.42093676041609335, + "grad_norm": 0.6763800382614136, + "learning_rate": 8.967184470533818e-06, + "loss": 0.72, + "step": 7648 + }, + { + "epoch": 0.420991799218449, + "grad_norm": 0.7560757994651794, + "learning_rate": 8.96692062507544e-06, + "loss": 0.7704, + "step": 7649 + }, + { + "epoch": 0.42104683802080467, + "grad_norm": 0.7289260029792786, + "learning_rate": 8.966656749802748e-06, + "loss": 0.7411, + "step": 7650 + }, + { + "epoch": 0.4211018768231603, + "grad_norm": 0.6935442686080933, + "learning_rate": 8.966392844717726e-06, + "loss": 0.7848, + "step": 7651 + }, + { + "epoch": 0.421156915625516, + "grad_norm": 0.7111918330192566, + "learning_rate": 8.966128909822356e-06, + "loss": 0.8377, + "step": 7652 + }, + { + "epoch": 0.4212119544278716, + "grad_norm": 0.8594884872436523, + "learning_rate": 8.965864945118625e-06, + "loss": 0.8227, + "step": 7653 + }, + { + "epoch": 0.4212669932302273, + "grad_norm": 0.6521008014678955, + "learning_rate": 8.965600950608513e-06, + "loss": 0.7034, + "step": 7654 + }, + { + "epoch": 0.42132203203258295, + "grad_norm": 0.6362404823303223, + "learning_rate": 8.965336926294007e-06, + "loss": 0.6712, + "step": 7655 + }, + { + "epoch": 0.42137707083493864, + "grad_norm": 0.6955040097236633, + "learning_rate": 8.965072872177088e-06, + "loss": 0.7789, + "step": 7656 + }, + { + "epoch": 0.42143210963729427, + "grad_norm": 0.7311720252037048, + "learning_rate": 8.964808788259745e-06, + "loss": 0.7522, + "step": 7657 + }, + { + "epoch": 0.42148714843964996, + "grad_norm": 0.781131386756897, + "learning_rate": 8.96454467454396e-06, + "loss": 0.7831, + "step": 7658 + }, + { + "epoch": 0.4215421872420056, + "grad_norm": 0.6740639805793762, + "learning_rate": 8.964280531031718e-06, + "loss": 0.7102, + "step": 7659 + }, + { + "epoch": 0.4215972260443613, + "grad_norm": 0.7843424677848816, + "learning_rate": 8.964016357725003e-06, + "loss": 0.8325, + "step": 7660 + }, + { + "epoch": 0.4216522648467169, + "grad_norm": 0.7833517789840698, + "learning_rate": 8.963752154625804e-06, + "loss": 0.8603, + "step": 7661 + }, + { + "epoch": 0.4217073036490726, + "grad_norm": 0.7270992994308472, + "learning_rate": 8.963487921736104e-06, + "loss": 0.745, + "step": 7662 + }, + { + "epoch": 0.42176234245142824, + "grad_norm": 0.6517582535743713, + "learning_rate": 8.963223659057892e-06, + "loss": 0.6983, + "step": 7663 + }, + { + "epoch": 0.42181738125378393, + "grad_norm": 0.6974934935569763, + "learning_rate": 8.962959366593149e-06, + "loss": 0.733, + "step": 7664 + }, + { + "epoch": 0.42187242005613956, + "grad_norm": 0.712045431137085, + "learning_rate": 8.962695044343865e-06, + "loss": 0.725, + "step": 7665 + }, + { + "epoch": 0.42192745885849525, + "grad_norm": 0.7311459183692932, + "learning_rate": 8.962430692312028e-06, + "loss": 0.8025, + "step": 7666 + }, + { + "epoch": 0.4219824976608509, + "grad_norm": 0.7439966201782227, + "learning_rate": 8.962166310499621e-06, + "loss": 0.7711, + "step": 7667 + }, + { + "epoch": 0.4220375364632066, + "grad_norm": 0.690832257270813, + "learning_rate": 8.961901898908632e-06, + "loss": 0.8414, + "step": 7668 + }, + { + "epoch": 0.4220925752655622, + "grad_norm": 0.8437964916229248, + "learning_rate": 8.961637457541049e-06, + "loss": 0.8253, + "step": 7669 + }, + { + "epoch": 0.4221476140679179, + "grad_norm": 0.7876344323158264, + "learning_rate": 8.96137298639886e-06, + "loss": 0.754, + "step": 7670 + }, + { + "epoch": 0.42220265287027353, + "grad_norm": 0.7551780343055725, + "learning_rate": 8.961108485484052e-06, + "loss": 0.8555, + "step": 7671 + }, + { + "epoch": 0.4222576916726292, + "grad_norm": 0.6867276430130005, + "learning_rate": 8.96084395479861e-06, + "loss": 0.7216, + "step": 7672 + }, + { + "epoch": 0.42231273047498485, + "grad_norm": 0.9052873849868774, + "learning_rate": 8.960579394344528e-06, + "loss": 0.7945, + "step": 7673 + }, + { + "epoch": 0.42236776927734054, + "grad_norm": 0.6731994152069092, + "learning_rate": 8.96031480412379e-06, + "loss": 0.7691, + "step": 7674 + }, + { + "epoch": 0.4224228080796962, + "grad_norm": 0.7074670195579529, + "learning_rate": 8.960050184138389e-06, + "loss": 0.8008, + "step": 7675 + }, + { + "epoch": 0.42247784688205187, + "grad_norm": 0.9482604265213013, + "learning_rate": 8.959785534390309e-06, + "loss": 0.7095, + "step": 7676 + }, + { + "epoch": 0.4225328856844075, + "grad_norm": 0.6915413737297058, + "learning_rate": 8.95952085488154e-06, + "loss": 0.6717, + "step": 7677 + }, + { + "epoch": 0.4225879244867632, + "grad_norm": 0.7565900087356567, + "learning_rate": 8.959256145614073e-06, + "loss": 0.8311, + "step": 7678 + }, + { + "epoch": 0.4226429632891188, + "grad_norm": 0.8307167887687683, + "learning_rate": 8.958991406589896e-06, + "loss": 0.8585, + "step": 7679 + }, + { + "epoch": 0.4226980020914745, + "grad_norm": 0.7955091595649719, + "learning_rate": 8.958726637811e-06, + "loss": 0.8154, + "step": 7680 + }, + { + "epoch": 0.42275304089383015, + "grad_norm": 0.7692292332649231, + "learning_rate": 8.958461839279376e-06, + "loss": 0.7965, + "step": 7681 + }, + { + "epoch": 0.42280807969618583, + "grad_norm": 0.7355942726135254, + "learning_rate": 8.95819701099701e-06, + "loss": 0.7557, + "step": 7682 + }, + { + "epoch": 0.42286311849854147, + "grad_norm": 0.8781518936157227, + "learning_rate": 8.957932152965895e-06, + "loss": 0.8033, + "step": 7683 + }, + { + "epoch": 0.42291815730089716, + "grad_norm": 0.7180802226066589, + "learning_rate": 8.957667265188022e-06, + "loss": 0.7283, + "step": 7684 + }, + { + "epoch": 0.4229731961032528, + "grad_norm": 0.6967236995697021, + "learning_rate": 8.95740234766538e-06, + "loss": 0.769, + "step": 7685 + }, + { + "epoch": 0.4230282349056085, + "grad_norm": 0.7462503910064697, + "learning_rate": 8.957137400399963e-06, + "loss": 0.8179, + "step": 7686 + }, + { + "epoch": 0.4230832737079641, + "grad_norm": 0.67714524269104, + "learning_rate": 8.956872423393761e-06, + "loss": 0.7976, + "step": 7687 + }, + { + "epoch": 0.4231383125103198, + "grad_norm": 0.8239946365356445, + "learning_rate": 8.956607416648763e-06, + "loss": 0.7946, + "step": 7688 + }, + { + "epoch": 0.42319335131267544, + "grad_norm": 0.6724610924720764, + "learning_rate": 8.956342380166963e-06, + "loss": 0.7633, + "step": 7689 + }, + { + "epoch": 0.42324839011503107, + "grad_norm": 0.744987964630127, + "learning_rate": 8.956077313950354e-06, + "loss": 0.9028, + "step": 7690 + }, + { + "epoch": 0.42330342891738676, + "grad_norm": 0.7700596451759338, + "learning_rate": 8.955812218000925e-06, + "loss": 0.8954, + "step": 7691 + }, + { + "epoch": 0.4233584677197424, + "grad_norm": 0.6952996253967285, + "learning_rate": 8.955547092320673e-06, + "loss": 0.8094, + "step": 7692 + }, + { + "epoch": 0.4234135065220981, + "grad_norm": 0.6410536766052246, + "learning_rate": 8.955281936911586e-06, + "loss": 0.6281, + "step": 7693 + }, + { + "epoch": 0.4234685453244537, + "grad_norm": 1.0939754247665405, + "learning_rate": 8.95501675177566e-06, + "loss": 0.8239, + "step": 7694 + }, + { + "epoch": 0.4235235841268094, + "grad_norm": 0.7419464588165283, + "learning_rate": 8.954751536914885e-06, + "loss": 0.8015, + "step": 7695 + }, + { + "epoch": 0.42357862292916504, + "grad_norm": 0.8171356320381165, + "learning_rate": 8.954486292331257e-06, + "loss": 0.8183, + "step": 7696 + }, + { + "epoch": 0.4236336617315207, + "grad_norm": 0.745884358882904, + "learning_rate": 8.95422101802677e-06, + "loss": 0.7457, + "step": 7697 + }, + { + "epoch": 0.42368870053387636, + "grad_norm": 0.7355740070343018, + "learning_rate": 8.953955714003414e-06, + "loss": 0.7517, + "step": 7698 + }, + { + "epoch": 0.42374373933623205, + "grad_norm": 0.7103458642959595, + "learning_rate": 8.953690380263186e-06, + "loss": 0.7306, + "step": 7699 + }, + { + "epoch": 0.4237987781385877, + "grad_norm": 0.7453970909118652, + "learning_rate": 8.95342501680808e-06, + "loss": 0.8396, + "step": 7700 + }, + { + "epoch": 0.4238538169409434, + "grad_norm": 0.7132760286331177, + "learning_rate": 8.953159623640088e-06, + "loss": 0.7861, + "step": 7701 + }, + { + "epoch": 0.423908855743299, + "grad_norm": 0.785827100276947, + "learning_rate": 8.952894200761209e-06, + "loss": 0.8681, + "step": 7702 + }, + { + "epoch": 0.4239638945456547, + "grad_norm": 0.7075281143188477, + "learning_rate": 8.952628748173433e-06, + "loss": 0.7257, + "step": 7703 + }, + { + "epoch": 0.42401893334801033, + "grad_norm": 0.8205186724662781, + "learning_rate": 8.952363265878758e-06, + "loss": 0.7361, + "step": 7704 + }, + { + "epoch": 0.424073972150366, + "grad_norm": 0.6517061591148376, + "learning_rate": 8.952097753879181e-06, + "loss": 0.7127, + "step": 7705 + }, + { + "epoch": 0.42412901095272165, + "grad_norm": 0.7252761125564575, + "learning_rate": 8.951832212176692e-06, + "loss": 0.796, + "step": 7706 + }, + { + "epoch": 0.42418404975507734, + "grad_norm": 0.6688609719276428, + "learning_rate": 8.951566640773292e-06, + "loss": 0.7698, + "step": 7707 + }, + { + "epoch": 0.424239088557433, + "grad_norm": 0.7163566946983337, + "learning_rate": 8.951301039670974e-06, + "loss": 0.8069, + "step": 7708 + }, + { + "epoch": 0.42429412735978866, + "grad_norm": 0.7027623057365417, + "learning_rate": 8.951035408871735e-06, + "loss": 0.7061, + "step": 7709 + }, + { + "epoch": 0.4243491661621443, + "grad_norm": 0.9558683037757874, + "learning_rate": 8.950769748377572e-06, + "loss": 0.926, + "step": 7710 + }, + { + "epoch": 0.4244042049645, + "grad_norm": 0.7173893451690674, + "learning_rate": 8.950504058190482e-06, + "loss": 0.7519, + "step": 7711 + }, + { + "epoch": 0.4244592437668556, + "grad_norm": 0.8481128811836243, + "learning_rate": 8.950238338312459e-06, + "loss": 0.7804, + "step": 7712 + }, + { + "epoch": 0.4245142825692113, + "grad_norm": 0.6957072615623474, + "learning_rate": 8.949972588745502e-06, + "loss": 0.611, + "step": 7713 + }, + { + "epoch": 0.42456932137156694, + "grad_norm": 0.7910122871398926, + "learning_rate": 8.94970680949161e-06, + "loss": 0.8435, + "step": 7714 + }, + { + "epoch": 0.42462436017392263, + "grad_norm": 0.8068616986274719, + "learning_rate": 8.949441000552777e-06, + "loss": 0.8658, + "step": 7715 + }, + { + "epoch": 0.42467939897627827, + "grad_norm": 0.718110978603363, + "learning_rate": 8.949175161931006e-06, + "loss": 0.7908, + "step": 7716 + }, + { + "epoch": 0.42473443777863396, + "grad_norm": 0.7329656481742859, + "learning_rate": 8.948909293628289e-06, + "loss": 0.7477, + "step": 7717 + }, + { + "epoch": 0.4247894765809896, + "grad_norm": 0.7046940326690674, + "learning_rate": 8.948643395646625e-06, + "loss": 0.7985, + "step": 7718 + }, + { + "epoch": 0.4248445153833453, + "grad_norm": 0.6699581742286682, + "learning_rate": 8.948377467988017e-06, + "loss": 0.6575, + "step": 7719 + }, + { + "epoch": 0.4248995541857009, + "grad_norm": 0.8055217266082764, + "learning_rate": 8.94811151065446e-06, + "loss": 0.7008, + "step": 7720 + }, + { + "epoch": 0.4249545929880566, + "grad_norm": 0.8374543190002441, + "learning_rate": 8.947845523647954e-06, + "loss": 0.8918, + "step": 7721 + }, + { + "epoch": 0.42500963179041223, + "grad_norm": 0.6974833607673645, + "learning_rate": 8.947579506970498e-06, + "loss": 0.8594, + "step": 7722 + }, + { + "epoch": 0.4250646705927679, + "grad_norm": 0.7466567754745483, + "learning_rate": 8.947313460624091e-06, + "loss": 0.6935, + "step": 7723 + }, + { + "epoch": 0.42511970939512356, + "grad_norm": 0.8118101358413696, + "learning_rate": 8.947047384610734e-06, + "loss": 0.8432, + "step": 7724 + }, + { + "epoch": 0.42517474819747925, + "grad_norm": 0.6885644197463989, + "learning_rate": 8.946781278932422e-06, + "loss": 0.8059, + "step": 7725 + }, + { + "epoch": 0.4252297869998349, + "grad_norm": 0.7257012128829956, + "learning_rate": 8.94651514359116e-06, + "loss": 0.8239, + "step": 7726 + }, + { + "epoch": 0.42528482580219057, + "grad_norm": 1.311591386795044, + "learning_rate": 8.946248978588947e-06, + "loss": 0.8207, + "step": 7727 + }, + { + "epoch": 0.4253398646045462, + "grad_norm": 0.7694151997566223, + "learning_rate": 8.945982783927784e-06, + "loss": 0.8948, + "step": 7728 + }, + { + "epoch": 0.4253949034069019, + "grad_norm": 0.6922980546951294, + "learning_rate": 8.945716559609669e-06, + "loss": 0.7883, + "step": 7729 + }, + { + "epoch": 0.4254499422092575, + "grad_norm": 0.7803757786750793, + "learning_rate": 8.945450305636605e-06, + "loss": 0.9166, + "step": 7730 + }, + { + "epoch": 0.4255049810116132, + "grad_norm": 0.6775311827659607, + "learning_rate": 8.945184022010593e-06, + "loss": 0.6976, + "step": 7731 + }, + { + "epoch": 0.42556001981396885, + "grad_norm": 0.7108052968978882, + "learning_rate": 8.944917708733634e-06, + "loss": 0.7763, + "step": 7732 + }, + { + "epoch": 0.4256150586163245, + "grad_norm": 0.7215770483016968, + "learning_rate": 8.94465136580773e-06, + "loss": 0.7907, + "step": 7733 + }, + { + "epoch": 0.42567009741868017, + "grad_norm": 0.6690788865089417, + "learning_rate": 8.944384993234881e-06, + "loss": 0.8403, + "step": 7734 + }, + { + "epoch": 0.4257251362210358, + "grad_norm": 0.7372478246688843, + "learning_rate": 8.94411859101709e-06, + "loss": 0.7618, + "step": 7735 + }, + { + "epoch": 0.4257801750233915, + "grad_norm": 0.9398306608200073, + "learning_rate": 8.94385215915636e-06, + "loss": 0.9043, + "step": 7736 + }, + { + "epoch": 0.4258352138257471, + "grad_norm": 0.8790311217308044, + "learning_rate": 8.943585697654693e-06, + "loss": 0.9378, + "step": 7737 + }, + { + "epoch": 0.4258902526281028, + "grad_norm": 0.7579166889190674, + "learning_rate": 8.943319206514091e-06, + "loss": 0.7913, + "step": 7738 + }, + { + "epoch": 0.42594529143045845, + "grad_norm": 0.6426860690116882, + "learning_rate": 8.943052685736559e-06, + "loss": 0.744, + "step": 7739 + }, + { + "epoch": 0.42600033023281414, + "grad_norm": 0.688117265701294, + "learning_rate": 8.942786135324098e-06, + "loss": 0.8386, + "step": 7740 + }, + { + "epoch": 0.4260553690351698, + "grad_norm": 0.7178692817687988, + "learning_rate": 8.94251955527871e-06, + "loss": 0.7937, + "step": 7741 + }, + { + "epoch": 0.42611040783752546, + "grad_norm": 0.7980415225028992, + "learning_rate": 8.942252945602403e-06, + "loss": 0.76, + "step": 7742 + }, + { + "epoch": 0.4261654466398811, + "grad_norm": 0.6858333349227905, + "learning_rate": 8.941986306297175e-06, + "loss": 0.8155, + "step": 7743 + }, + { + "epoch": 0.4262204854422368, + "grad_norm": 0.763297975063324, + "learning_rate": 8.941719637365037e-06, + "loss": 0.8003, + "step": 7744 + }, + { + "epoch": 0.4262755242445924, + "grad_norm": 0.661016047000885, + "learning_rate": 8.941452938807986e-06, + "loss": 0.6788, + "step": 7745 + }, + { + "epoch": 0.4263305630469481, + "grad_norm": 0.7168089151382446, + "learning_rate": 8.94118621062803e-06, + "loss": 0.7791, + "step": 7746 + }, + { + "epoch": 0.42638560184930374, + "grad_norm": 0.6879743337631226, + "learning_rate": 8.940919452827174e-06, + "loss": 0.7978, + "step": 7747 + }, + { + "epoch": 0.42644064065165943, + "grad_norm": 0.672298014163971, + "learning_rate": 8.940652665407424e-06, + "loss": 0.7569, + "step": 7748 + }, + { + "epoch": 0.42649567945401506, + "grad_norm": 0.7237414717674255, + "learning_rate": 8.940385848370782e-06, + "loss": 0.6788, + "step": 7749 + }, + { + "epoch": 0.42655071825637075, + "grad_norm": 0.6793895363807678, + "learning_rate": 8.940119001719255e-06, + "loss": 0.749, + "step": 7750 + }, + { + "epoch": 0.4266057570587264, + "grad_norm": 1.1172789335250854, + "learning_rate": 8.939852125454847e-06, + "loss": 0.9017, + "step": 7751 + }, + { + "epoch": 0.4266607958610821, + "grad_norm": 0.7138717770576477, + "learning_rate": 8.939585219579567e-06, + "loss": 0.8586, + "step": 7752 + }, + { + "epoch": 0.4267158346634377, + "grad_norm": 0.8678629398345947, + "learning_rate": 8.939318284095417e-06, + "loss": 0.7333, + "step": 7753 + }, + { + "epoch": 0.4267708734657934, + "grad_norm": 0.7274941802024841, + "learning_rate": 8.939051319004407e-06, + "loss": 0.8426, + "step": 7754 + }, + { + "epoch": 0.42682591226814903, + "grad_norm": 0.6845358610153198, + "learning_rate": 8.93878432430854e-06, + "loss": 0.7731, + "step": 7755 + }, + { + "epoch": 0.4268809510705047, + "grad_norm": 0.7042781710624695, + "learning_rate": 8.938517300009826e-06, + "loss": 0.6703, + "step": 7756 + }, + { + "epoch": 0.42693598987286036, + "grad_norm": 0.7147190570831299, + "learning_rate": 8.93825024611027e-06, + "loss": 0.7977, + "step": 7757 + }, + { + "epoch": 0.42699102867521604, + "grad_norm": 0.6584187150001526, + "learning_rate": 8.93798316261188e-06, + "loss": 0.716, + "step": 7758 + }, + { + "epoch": 0.4270460674775717, + "grad_norm": 0.8061439990997314, + "learning_rate": 8.93771604951666e-06, + "loss": 0.9075, + "step": 7759 + }, + { + "epoch": 0.42710110627992737, + "grad_norm": 0.6741406917572021, + "learning_rate": 8.937448906826622e-06, + "loss": 0.7828, + "step": 7760 + }, + { + "epoch": 0.427156145082283, + "grad_norm": 0.8791692852973938, + "learning_rate": 8.937181734543773e-06, + "loss": 0.7685, + "step": 7761 + }, + { + "epoch": 0.4272111838846387, + "grad_norm": 0.6804112195968628, + "learning_rate": 8.936914532670119e-06, + "loss": 0.7672, + "step": 7762 + }, + { + "epoch": 0.4272662226869943, + "grad_norm": 0.6983451843261719, + "learning_rate": 8.936647301207668e-06, + "loss": 0.8228, + "step": 7763 + }, + { + "epoch": 0.42732126148935, + "grad_norm": 0.8248929977416992, + "learning_rate": 8.936380040158432e-06, + "loss": 0.7628, + "step": 7764 + }, + { + "epoch": 0.42737630029170565, + "grad_norm": 0.8324941992759705, + "learning_rate": 8.936112749524415e-06, + "loss": 0.8125, + "step": 7765 + }, + { + "epoch": 0.42743133909406134, + "grad_norm": 0.7489150762557983, + "learning_rate": 8.935845429307631e-06, + "loss": 0.8766, + "step": 7766 + }, + { + "epoch": 0.42748637789641697, + "grad_norm": 0.7323104739189148, + "learning_rate": 8.935578079510083e-06, + "loss": 0.8607, + "step": 7767 + }, + { + "epoch": 0.42754141669877266, + "grad_norm": 0.6825152635574341, + "learning_rate": 8.935310700133786e-06, + "loss": 0.7817, + "step": 7768 + }, + { + "epoch": 0.4275964555011283, + "grad_norm": 0.8928677439689636, + "learning_rate": 8.935043291180748e-06, + "loss": 0.7621, + "step": 7769 + }, + { + "epoch": 0.427651494303484, + "grad_norm": 0.7071405649185181, + "learning_rate": 8.934775852652975e-06, + "loss": 0.7798, + "step": 7770 + }, + { + "epoch": 0.4277065331058396, + "grad_norm": 0.8225427269935608, + "learning_rate": 8.934508384552481e-06, + "loss": 0.7212, + "step": 7771 + }, + { + "epoch": 0.4277615719081953, + "grad_norm": 0.6931234002113342, + "learning_rate": 8.934240886881276e-06, + "loss": 0.7301, + "step": 7772 + }, + { + "epoch": 0.42781661071055094, + "grad_norm": 0.6901859641075134, + "learning_rate": 8.933973359641369e-06, + "loss": 0.6974, + "step": 7773 + }, + { + "epoch": 0.4278716495129066, + "grad_norm": 0.7736960649490356, + "learning_rate": 8.93370580283477e-06, + "loss": 0.6562, + "step": 7774 + }, + { + "epoch": 0.42792668831526226, + "grad_norm": 0.7363499999046326, + "learning_rate": 8.933438216463495e-06, + "loss": 0.8274, + "step": 7775 + }, + { + "epoch": 0.4279817271176179, + "grad_norm": 0.6855602860450745, + "learning_rate": 8.933170600529548e-06, + "loss": 0.7576, + "step": 7776 + }, + { + "epoch": 0.4280367659199736, + "grad_norm": 0.7641676664352417, + "learning_rate": 8.932902955034945e-06, + "loss": 0.7837, + "step": 7777 + }, + { + "epoch": 0.4280918047223292, + "grad_norm": 0.74812251329422, + "learning_rate": 8.932635279981695e-06, + "loss": 0.8402, + "step": 7778 + }, + { + "epoch": 0.4281468435246849, + "grad_norm": 0.7445259094238281, + "learning_rate": 8.932367575371813e-06, + "loss": 0.862, + "step": 7779 + }, + { + "epoch": 0.42820188232704054, + "grad_norm": 0.8977177739143372, + "learning_rate": 8.932099841207306e-06, + "loss": 0.7735, + "step": 7780 + }, + { + "epoch": 0.42825692112939623, + "grad_norm": 0.74172043800354, + "learning_rate": 8.93183207749019e-06, + "loss": 0.7053, + "step": 7781 + }, + { + "epoch": 0.42831195993175186, + "grad_norm": 0.6670083999633789, + "learning_rate": 8.931564284222479e-06, + "loss": 0.6348, + "step": 7782 + }, + { + "epoch": 0.42836699873410755, + "grad_norm": 0.7575422525405884, + "learning_rate": 8.93129646140618e-06, + "loss": 0.9354, + "step": 7783 + }, + { + "epoch": 0.4284220375364632, + "grad_norm": 0.7436977624893188, + "learning_rate": 8.931028609043311e-06, + "loss": 0.7461, + "step": 7784 + }, + { + "epoch": 0.4284770763388189, + "grad_norm": 0.7383070588111877, + "learning_rate": 8.930760727135882e-06, + "loss": 0.7629, + "step": 7785 + }, + { + "epoch": 0.4285321151411745, + "grad_norm": 0.6926067471504211, + "learning_rate": 8.93049281568591e-06, + "loss": 0.6788, + "step": 7786 + }, + { + "epoch": 0.4285871539435302, + "grad_norm": 0.7680530548095703, + "learning_rate": 8.930224874695404e-06, + "loss": 0.722, + "step": 7787 + }, + { + "epoch": 0.42864219274588583, + "grad_norm": 0.9880867004394531, + "learning_rate": 8.92995690416638e-06, + "loss": 0.833, + "step": 7788 + }, + { + "epoch": 0.4286972315482415, + "grad_norm": 0.7915430068969727, + "learning_rate": 8.929688904100853e-06, + "loss": 0.7643, + "step": 7789 + }, + { + "epoch": 0.42875227035059715, + "grad_norm": 0.6972275376319885, + "learning_rate": 8.929420874500836e-06, + "loss": 0.7697, + "step": 7790 + }, + { + "epoch": 0.42880730915295284, + "grad_norm": 0.9583331346511841, + "learning_rate": 8.929152815368343e-06, + "loss": 0.7591, + "step": 7791 + }, + { + "epoch": 0.4288623479553085, + "grad_norm": 0.7254299521446228, + "learning_rate": 8.928884726705388e-06, + "loss": 0.7913, + "step": 7792 + }, + { + "epoch": 0.42891738675766417, + "grad_norm": 0.7925865054130554, + "learning_rate": 8.928616608513989e-06, + "loss": 0.8248, + "step": 7793 + }, + { + "epoch": 0.4289724255600198, + "grad_norm": 0.9367457628250122, + "learning_rate": 8.928348460796157e-06, + "loss": 0.7767, + "step": 7794 + }, + { + "epoch": 0.4290274643623755, + "grad_norm": 0.8511868119239807, + "learning_rate": 8.928080283553912e-06, + "loss": 0.841, + "step": 7795 + }, + { + "epoch": 0.4290825031647311, + "grad_norm": 0.8518061637878418, + "learning_rate": 8.927812076789267e-06, + "loss": 0.7907, + "step": 7796 + }, + { + "epoch": 0.4291375419670868, + "grad_norm": 0.7208365797996521, + "learning_rate": 8.927543840504236e-06, + "loss": 0.7344, + "step": 7797 + }, + { + "epoch": 0.42919258076944244, + "grad_norm": 0.7541850209236145, + "learning_rate": 8.927275574700838e-06, + "loss": 0.7724, + "step": 7798 + }, + { + "epoch": 0.42924761957179813, + "grad_norm": 0.7378629446029663, + "learning_rate": 8.927007279381087e-06, + "loss": 0.7614, + "step": 7799 + }, + { + "epoch": 0.42930265837415377, + "grad_norm": 0.7358561158180237, + "learning_rate": 8.926738954547001e-06, + "loss": 0.7288, + "step": 7800 + }, + { + "epoch": 0.42935769717650946, + "grad_norm": 0.7385967969894409, + "learning_rate": 8.926470600200597e-06, + "loss": 0.7562, + "step": 7801 + }, + { + "epoch": 0.4294127359788651, + "grad_norm": 0.6904877424240112, + "learning_rate": 8.92620221634389e-06, + "loss": 0.6507, + "step": 7802 + }, + { + "epoch": 0.4294677747812208, + "grad_norm": 0.7205148935317993, + "learning_rate": 8.925933802978898e-06, + "loss": 0.7683, + "step": 7803 + }, + { + "epoch": 0.4295228135835764, + "grad_norm": 0.6830344200134277, + "learning_rate": 8.925665360107639e-06, + "loss": 0.6886, + "step": 7804 + }, + { + "epoch": 0.4295778523859321, + "grad_norm": 0.7648812532424927, + "learning_rate": 8.92539688773213e-06, + "loss": 0.7559, + "step": 7805 + }, + { + "epoch": 0.42963289118828774, + "grad_norm": 0.7819112539291382, + "learning_rate": 8.925128385854389e-06, + "loss": 0.7443, + "step": 7806 + }, + { + "epoch": 0.4296879299906434, + "grad_norm": 0.6742433309555054, + "learning_rate": 8.924859854476433e-06, + "loss": 0.7191, + "step": 7807 + }, + { + "epoch": 0.42974296879299906, + "grad_norm": 0.7368177771568298, + "learning_rate": 8.924591293600281e-06, + "loss": 0.6946, + "step": 7808 + }, + { + "epoch": 0.42979800759535475, + "grad_norm": 0.663112998008728, + "learning_rate": 8.924322703227953e-06, + "loss": 0.7405, + "step": 7809 + }, + { + "epoch": 0.4298530463977104, + "grad_norm": 0.6735410690307617, + "learning_rate": 8.924054083361465e-06, + "loss": 0.7982, + "step": 7810 + }, + { + "epoch": 0.42990808520006607, + "grad_norm": 0.7770369648933411, + "learning_rate": 8.923785434002834e-06, + "loss": 0.9179, + "step": 7811 + }, + { + "epoch": 0.4299631240024217, + "grad_norm": 0.7464482188224792, + "learning_rate": 8.923516755154085e-06, + "loss": 0.8514, + "step": 7812 + }, + { + "epoch": 0.4300181628047774, + "grad_norm": 0.9249551892280579, + "learning_rate": 8.923248046817235e-06, + "loss": 0.8287, + "step": 7813 + }, + { + "epoch": 0.430073201607133, + "grad_norm": 0.7071338891983032, + "learning_rate": 8.922979308994302e-06, + "loss": 0.7509, + "step": 7814 + }, + { + "epoch": 0.4301282404094887, + "grad_norm": 0.6910794377326965, + "learning_rate": 8.922710541687305e-06, + "loss": 0.7373, + "step": 7815 + }, + { + "epoch": 0.43018327921184435, + "grad_norm": 0.8424028158187866, + "learning_rate": 8.922441744898267e-06, + "loss": 0.741, + "step": 7816 + }, + { + "epoch": 0.43023831801420004, + "grad_norm": 0.8162125945091248, + "learning_rate": 8.922172918629208e-06, + "loss": 0.8044, + "step": 7817 + }, + { + "epoch": 0.43029335681655567, + "grad_norm": 0.7415170669555664, + "learning_rate": 8.921904062882145e-06, + "loss": 0.7427, + "step": 7818 + }, + { + "epoch": 0.4303483956189113, + "grad_norm": 1.1357808113098145, + "learning_rate": 8.921635177659103e-06, + "loss": 0.7802, + "step": 7819 + }, + { + "epoch": 0.430403434421267, + "grad_norm": 0.7039839625358582, + "learning_rate": 8.9213662629621e-06, + "loss": 0.7368, + "step": 7820 + }, + { + "epoch": 0.43045847322362263, + "grad_norm": 0.721077024936676, + "learning_rate": 8.921097318793157e-06, + "loss": 0.6575, + "step": 7821 + }, + { + "epoch": 0.4305135120259783, + "grad_norm": 0.7823510766029358, + "learning_rate": 8.920828345154297e-06, + "loss": 0.7499, + "step": 7822 + }, + { + "epoch": 0.43056855082833395, + "grad_norm": 0.6400569677352905, + "learning_rate": 8.920559342047539e-06, + "loss": 0.7091, + "step": 7823 + }, + { + "epoch": 0.43062358963068964, + "grad_norm": 0.8974951505661011, + "learning_rate": 8.920290309474908e-06, + "loss": 0.7228, + "step": 7824 + }, + { + "epoch": 0.4306786284330453, + "grad_norm": 0.8176010847091675, + "learning_rate": 8.920021247438426e-06, + "loss": 0.8852, + "step": 7825 + }, + { + "epoch": 0.43073366723540096, + "grad_norm": 0.7591422200202942, + "learning_rate": 8.919752155940112e-06, + "loss": 0.8382, + "step": 7826 + }, + { + "epoch": 0.4307887060377566, + "grad_norm": 0.7089776396751404, + "learning_rate": 8.919483034981988e-06, + "loss": 0.7188, + "step": 7827 + }, + { + "epoch": 0.4308437448401123, + "grad_norm": 0.7328840494155884, + "learning_rate": 8.919213884566081e-06, + "loss": 0.7609, + "step": 7828 + }, + { + "epoch": 0.4308987836424679, + "grad_norm": 0.6473509669303894, + "learning_rate": 8.918944704694411e-06, + "loss": 0.7027, + "step": 7829 + }, + { + "epoch": 0.4309538224448236, + "grad_norm": 0.6585624814033508, + "learning_rate": 8.918675495369003e-06, + "loss": 0.7133, + "step": 7830 + }, + { + "epoch": 0.43100886124717924, + "grad_norm": 0.7232397794723511, + "learning_rate": 8.918406256591876e-06, + "loss": 0.7458, + "step": 7831 + }, + { + "epoch": 0.43106390004953493, + "grad_norm": 0.8752645254135132, + "learning_rate": 8.918136988365059e-06, + "loss": 0.671, + "step": 7832 + }, + { + "epoch": 0.43111893885189057, + "grad_norm": 0.7890885472297668, + "learning_rate": 8.917867690690573e-06, + "loss": 0.7674, + "step": 7833 + }, + { + "epoch": 0.43117397765424625, + "grad_norm": 0.6725128293037415, + "learning_rate": 8.917598363570441e-06, + "loss": 0.7373, + "step": 7834 + }, + { + "epoch": 0.4312290164566019, + "grad_norm": 0.808897852897644, + "learning_rate": 8.917329007006688e-06, + "loss": 0.8397, + "step": 7835 + }, + { + "epoch": 0.4312840552589576, + "grad_norm": 0.7268605828285217, + "learning_rate": 8.91705962100134e-06, + "loss": 0.7957, + "step": 7836 + }, + { + "epoch": 0.4313390940613132, + "grad_norm": 0.7336069345474243, + "learning_rate": 8.916790205556421e-06, + "loss": 0.746, + "step": 7837 + }, + { + "epoch": 0.4313941328636689, + "grad_norm": 0.7380902171134949, + "learning_rate": 8.916520760673955e-06, + "loss": 0.674, + "step": 7838 + }, + { + "epoch": 0.43144917166602453, + "grad_norm": 0.8041831851005554, + "learning_rate": 8.916251286355967e-06, + "loss": 0.8392, + "step": 7839 + }, + { + "epoch": 0.4315042104683802, + "grad_norm": 0.6745681166648865, + "learning_rate": 8.915981782604481e-06, + "loss": 0.7676, + "step": 7840 + }, + { + "epoch": 0.43155924927073586, + "grad_norm": 0.6572039127349854, + "learning_rate": 8.915712249421526e-06, + "loss": 0.7471, + "step": 7841 + }, + { + "epoch": 0.43161428807309155, + "grad_norm": 0.7250062227249146, + "learning_rate": 8.915442686809124e-06, + "loss": 0.8566, + "step": 7842 + }, + { + "epoch": 0.4316693268754472, + "grad_norm": 0.7008941769599915, + "learning_rate": 8.915173094769306e-06, + "loss": 0.7876, + "step": 7843 + }, + { + "epoch": 0.43172436567780287, + "grad_norm": 0.7078337073326111, + "learning_rate": 8.914903473304093e-06, + "loss": 0.756, + "step": 7844 + }, + { + "epoch": 0.4317794044801585, + "grad_norm": 0.7822949886322021, + "learning_rate": 8.914633822415513e-06, + "loss": 0.9423, + "step": 7845 + }, + { + "epoch": 0.4318344432825142, + "grad_norm": 0.6707580089569092, + "learning_rate": 8.914364142105593e-06, + "loss": 0.639, + "step": 7846 + }, + { + "epoch": 0.4318894820848698, + "grad_norm": 0.7868423461914062, + "learning_rate": 8.914094432376362e-06, + "loss": 0.7768, + "step": 7847 + }, + { + "epoch": 0.4319445208872255, + "grad_norm": 0.6147592067718506, + "learning_rate": 8.913824693229845e-06, + "loss": 0.6693, + "step": 7848 + }, + { + "epoch": 0.43199955968958115, + "grad_norm": 0.6901249885559082, + "learning_rate": 8.913554924668067e-06, + "loss": 0.7779, + "step": 7849 + }, + { + "epoch": 0.43205459849193684, + "grad_norm": 0.7062137126922607, + "learning_rate": 8.913285126693058e-06, + "loss": 0.7951, + "step": 7850 + }, + { + "epoch": 0.43210963729429247, + "grad_norm": 0.6363390684127808, + "learning_rate": 8.913015299306846e-06, + "loss": 0.6723, + "step": 7851 + }, + { + "epoch": 0.43216467609664816, + "grad_norm": 0.7168677449226379, + "learning_rate": 8.912745442511459e-06, + "loss": 0.7442, + "step": 7852 + }, + { + "epoch": 0.4322197148990038, + "grad_norm": 0.7347995042800903, + "learning_rate": 8.912475556308925e-06, + "loss": 0.8361, + "step": 7853 + }, + { + "epoch": 0.4322747537013595, + "grad_norm": 0.683777391910553, + "learning_rate": 8.91220564070127e-06, + "loss": 0.7583, + "step": 7854 + }, + { + "epoch": 0.4323297925037151, + "grad_norm": 0.7436330914497375, + "learning_rate": 8.911935695690527e-06, + "loss": 0.8414, + "step": 7855 + }, + { + "epoch": 0.4323848313060708, + "grad_norm": 0.7748109102249146, + "learning_rate": 8.911665721278721e-06, + "loss": 0.7812, + "step": 7856 + }, + { + "epoch": 0.43243987010842644, + "grad_norm": 0.7984411120414734, + "learning_rate": 8.911395717467883e-06, + "loss": 0.6845, + "step": 7857 + }, + { + "epoch": 0.4324949089107821, + "grad_norm": 0.680144727230072, + "learning_rate": 8.911125684260042e-06, + "loss": 0.7156, + "step": 7858 + }, + { + "epoch": 0.43254994771313776, + "grad_norm": 0.7738325595855713, + "learning_rate": 8.910855621657228e-06, + "loss": 0.7295, + "step": 7859 + }, + { + "epoch": 0.43260498651549345, + "grad_norm": 0.7276971340179443, + "learning_rate": 8.910585529661469e-06, + "loss": 0.7982, + "step": 7860 + }, + { + "epoch": 0.4326600253178491, + "grad_norm": 0.7655037641525269, + "learning_rate": 8.910315408274796e-06, + "loss": 0.8416, + "step": 7861 + }, + { + "epoch": 0.4327150641202047, + "grad_norm": 0.7220892906188965, + "learning_rate": 8.910045257499238e-06, + "loss": 0.8002, + "step": 7862 + }, + { + "epoch": 0.4327701029225604, + "grad_norm": 0.6255655884742737, + "learning_rate": 8.90977507733683e-06, + "loss": 0.6477, + "step": 7863 + }, + { + "epoch": 0.43282514172491604, + "grad_norm": 0.649472713470459, + "learning_rate": 8.909504867789594e-06, + "loss": 0.6838, + "step": 7864 + }, + { + "epoch": 0.43288018052727173, + "grad_norm": 0.6915234923362732, + "learning_rate": 8.909234628859568e-06, + "loss": 0.7146, + "step": 7865 + }, + { + "epoch": 0.43293521932962736, + "grad_norm": 0.7120145559310913, + "learning_rate": 8.908964360548783e-06, + "loss": 0.7782, + "step": 7866 + }, + { + "epoch": 0.43299025813198305, + "grad_norm": 0.8125410079956055, + "learning_rate": 8.908694062859267e-06, + "loss": 0.7514, + "step": 7867 + }, + { + "epoch": 0.4330452969343387, + "grad_norm": 0.6821436882019043, + "learning_rate": 8.908423735793053e-06, + "loss": 0.8074, + "step": 7868 + }, + { + "epoch": 0.4331003357366944, + "grad_norm": 0.8079590201377869, + "learning_rate": 8.908153379352171e-06, + "loss": 0.7932, + "step": 7869 + }, + { + "epoch": 0.43315537453905, + "grad_norm": 0.676013708114624, + "learning_rate": 8.907882993538655e-06, + "loss": 0.6611, + "step": 7870 + }, + { + "epoch": 0.4332104133414057, + "grad_norm": 0.706624448299408, + "learning_rate": 8.907612578354537e-06, + "loss": 0.8241, + "step": 7871 + }, + { + "epoch": 0.43326545214376133, + "grad_norm": 0.6533300876617432, + "learning_rate": 8.907342133801848e-06, + "loss": 0.6969, + "step": 7872 + }, + { + "epoch": 0.433320490946117, + "grad_norm": 0.6778282523155212, + "learning_rate": 8.907071659882622e-06, + "loss": 0.6877, + "step": 7873 + }, + { + "epoch": 0.43337552974847265, + "grad_norm": 0.7068879008293152, + "learning_rate": 8.906801156598892e-06, + "loss": 0.7912, + "step": 7874 + }, + { + "epoch": 0.43343056855082834, + "grad_norm": 0.6620263457298279, + "learning_rate": 8.90653062395269e-06, + "loss": 0.7317, + "step": 7875 + }, + { + "epoch": 0.433485607353184, + "grad_norm": 0.7084807753562927, + "learning_rate": 8.906260061946049e-06, + "loss": 0.7268, + "step": 7876 + }, + { + "epoch": 0.43354064615553967, + "grad_norm": 0.7899147272109985, + "learning_rate": 8.905989470581003e-06, + "loss": 0.8258, + "step": 7877 + }, + { + "epoch": 0.4335956849578953, + "grad_norm": 0.6657128930091858, + "learning_rate": 8.905718849859585e-06, + "loss": 0.6564, + "step": 7878 + }, + { + "epoch": 0.433650723760251, + "grad_norm": 0.8737723231315613, + "learning_rate": 8.905448199783831e-06, + "loss": 0.8646, + "step": 7879 + }, + { + "epoch": 0.4337057625626066, + "grad_norm": 0.7517673969268799, + "learning_rate": 8.905177520355775e-06, + "loss": 0.7658, + "step": 7880 + }, + { + "epoch": 0.4337608013649623, + "grad_norm": 0.6724270582199097, + "learning_rate": 8.904906811577447e-06, + "loss": 0.7509, + "step": 7881 + }, + { + "epoch": 0.43381584016731795, + "grad_norm": 0.6490511894226074, + "learning_rate": 8.904636073450885e-06, + "loss": 0.7282, + "step": 7882 + }, + { + "epoch": 0.43387087896967363, + "grad_norm": 0.73885178565979, + "learning_rate": 8.904365305978126e-06, + "loss": 0.7575, + "step": 7883 + }, + { + "epoch": 0.43392591777202927, + "grad_norm": 0.6823462843894958, + "learning_rate": 8.9040945091612e-06, + "loss": 0.7566, + "step": 7884 + }, + { + "epoch": 0.43398095657438496, + "grad_norm": 0.6705971956253052, + "learning_rate": 8.903823683002146e-06, + "loss": 0.7726, + "step": 7885 + }, + { + "epoch": 0.4340359953767406, + "grad_norm": 0.6898428201675415, + "learning_rate": 8.903552827502998e-06, + "loss": 0.7545, + "step": 7886 + }, + { + "epoch": 0.4340910341790963, + "grad_norm": 0.810357928276062, + "learning_rate": 8.90328194266579e-06, + "loss": 0.8883, + "step": 7887 + }, + { + "epoch": 0.4341460729814519, + "grad_norm": 0.6505162119865417, + "learning_rate": 8.903011028492563e-06, + "loss": 0.7205, + "step": 7888 + }, + { + "epoch": 0.4342011117838076, + "grad_norm": 0.8401693105697632, + "learning_rate": 8.902740084985348e-06, + "loss": 0.8105, + "step": 7889 + }, + { + "epoch": 0.43425615058616324, + "grad_norm": 0.7151880860328674, + "learning_rate": 8.902469112146183e-06, + "loss": 0.7748, + "step": 7890 + }, + { + "epoch": 0.4343111893885189, + "grad_norm": 0.7257007956504822, + "learning_rate": 8.902198109977107e-06, + "loss": 0.7818, + "step": 7891 + }, + { + "epoch": 0.43436622819087456, + "grad_norm": 0.786691427230835, + "learning_rate": 8.901927078480153e-06, + "loss": 0.8527, + "step": 7892 + }, + { + "epoch": 0.43442126699323025, + "grad_norm": 0.7420910596847534, + "learning_rate": 8.901656017657358e-06, + "loss": 0.7087, + "step": 7893 + }, + { + "epoch": 0.4344763057955859, + "grad_norm": 0.6713958978652954, + "learning_rate": 8.901384927510763e-06, + "loss": 0.7366, + "step": 7894 + }, + { + "epoch": 0.43453134459794157, + "grad_norm": 1.0276658535003662, + "learning_rate": 8.901113808042402e-06, + "loss": 0.7462, + "step": 7895 + }, + { + "epoch": 0.4345863834002972, + "grad_norm": 0.7207444906234741, + "learning_rate": 8.900842659254314e-06, + "loss": 0.6777, + "step": 7896 + }, + { + "epoch": 0.4346414222026529, + "grad_norm": 0.7581979036331177, + "learning_rate": 8.900571481148538e-06, + "loss": 0.8081, + "step": 7897 + }, + { + "epoch": 0.4346964610050085, + "grad_norm": 0.9224075675010681, + "learning_rate": 8.90030027372711e-06, + "loss": 0.892, + "step": 7898 + }, + { + "epoch": 0.4347514998073642, + "grad_norm": 0.6844260096549988, + "learning_rate": 8.900029036992069e-06, + "loss": 0.8063, + "step": 7899 + }, + { + "epoch": 0.43480653860971985, + "grad_norm": 0.7008691430091858, + "learning_rate": 8.899757770945453e-06, + "loss": 0.6998, + "step": 7900 + }, + { + "epoch": 0.43486157741207554, + "grad_norm": 0.7311949729919434, + "learning_rate": 8.899486475589303e-06, + "loss": 0.7724, + "step": 7901 + }, + { + "epoch": 0.4349166162144312, + "grad_norm": 0.7441468238830566, + "learning_rate": 8.899215150925656e-06, + "loss": 0.7728, + "step": 7902 + }, + { + "epoch": 0.43497165501678686, + "grad_norm": 0.7405179142951965, + "learning_rate": 8.89894379695655e-06, + "loss": 0.8267, + "step": 7903 + }, + { + "epoch": 0.4350266938191425, + "grad_norm": 0.6967620253562927, + "learning_rate": 8.898672413684029e-06, + "loss": 0.7284, + "step": 7904 + }, + { + "epoch": 0.43508173262149813, + "grad_norm": 0.8979219794273376, + "learning_rate": 8.898401001110127e-06, + "loss": 0.8267, + "step": 7905 + }, + { + "epoch": 0.4351367714238538, + "grad_norm": 0.7905356884002686, + "learning_rate": 8.898129559236888e-06, + "loss": 0.8011, + "step": 7906 + }, + { + "epoch": 0.43519181022620945, + "grad_norm": 0.6740859150886536, + "learning_rate": 8.897858088066351e-06, + "loss": 0.6597, + "step": 7907 + }, + { + "epoch": 0.43524684902856514, + "grad_norm": 0.7451572418212891, + "learning_rate": 8.897586587600555e-06, + "loss": 0.7466, + "step": 7908 + }, + { + "epoch": 0.4353018878309208, + "grad_norm": 0.7726565003395081, + "learning_rate": 8.897315057841542e-06, + "loss": 0.7873, + "step": 7909 + }, + { + "epoch": 0.43535692663327646, + "grad_norm": 0.8348171710968018, + "learning_rate": 8.897043498791354e-06, + "loss": 0.7583, + "step": 7910 + }, + { + "epoch": 0.4354119654356321, + "grad_norm": 0.6714087724685669, + "learning_rate": 8.896771910452027e-06, + "loss": 0.7909, + "step": 7911 + }, + { + "epoch": 0.4354670042379878, + "grad_norm": 0.7397969365119934, + "learning_rate": 8.896500292825607e-06, + "loss": 0.7734, + "step": 7912 + }, + { + "epoch": 0.4355220430403434, + "grad_norm": 0.6806391477584839, + "learning_rate": 8.896228645914133e-06, + "loss": 0.7898, + "step": 7913 + }, + { + "epoch": 0.4355770818426991, + "grad_norm": 0.7135224342346191, + "learning_rate": 8.89595696971965e-06, + "loss": 0.7453, + "step": 7914 + }, + { + "epoch": 0.43563212064505474, + "grad_norm": 0.8275992274284363, + "learning_rate": 8.895685264244195e-06, + "loss": 0.7326, + "step": 7915 + }, + { + "epoch": 0.43568715944741043, + "grad_norm": 0.7254159450531006, + "learning_rate": 8.895413529489813e-06, + "loss": 0.7523, + "step": 7916 + }, + { + "epoch": 0.43574219824976607, + "grad_norm": 0.8060647249221802, + "learning_rate": 8.895141765458546e-06, + "loss": 0.7878, + "step": 7917 + }, + { + "epoch": 0.43579723705212176, + "grad_norm": 0.7007316946983337, + "learning_rate": 8.894869972152435e-06, + "loss": 0.7837, + "step": 7918 + }, + { + "epoch": 0.4358522758544774, + "grad_norm": 0.6874841451644897, + "learning_rate": 8.894598149573524e-06, + "loss": 0.7773, + "step": 7919 + }, + { + "epoch": 0.4359073146568331, + "grad_norm": 0.7557696104049683, + "learning_rate": 8.894326297723856e-06, + "loss": 0.6905, + "step": 7920 + }, + { + "epoch": 0.4359623534591887, + "grad_norm": 0.7589512467384338, + "learning_rate": 8.894054416605475e-06, + "loss": 0.8292, + "step": 7921 + }, + { + "epoch": 0.4360173922615444, + "grad_norm": 0.9062818884849548, + "learning_rate": 8.893782506220424e-06, + "loss": 0.9149, + "step": 7922 + }, + { + "epoch": 0.43607243106390003, + "grad_norm": 0.7553420662879944, + "learning_rate": 8.893510566570744e-06, + "loss": 0.7256, + "step": 7923 + }, + { + "epoch": 0.4361274698662557, + "grad_norm": 0.7130489349365234, + "learning_rate": 8.89323859765848e-06, + "loss": 0.7375, + "step": 7924 + }, + { + "epoch": 0.43618250866861136, + "grad_norm": 0.6234793066978455, + "learning_rate": 8.89296659948568e-06, + "loss": 0.716, + "step": 7925 + }, + { + "epoch": 0.43623754747096705, + "grad_norm": 0.7527539134025574, + "learning_rate": 8.892694572054383e-06, + "loss": 0.7884, + "step": 7926 + }, + { + "epoch": 0.4362925862733227, + "grad_norm": 0.7677647471427917, + "learning_rate": 8.892422515366636e-06, + "loss": 0.7136, + "step": 7927 + }, + { + "epoch": 0.43634762507567837, + "grad_norm": 0.7212143540382385, + "learning_rate": 8.892150429424484e-06, + "loss": 0.8113, + "step": 7928 + }, + { + "epoch": 0.436402663878034, + "grad_norm": 0.6735568046569824, + "learning_rate": 8.89187831422997e-06, + "loss": 0.6472, + "step": 7929 + }, + { + "epoch": 0.4364577026803897, + "grad_norm": 0.7120702862739563, + "learning_rate": 8.891606169785141e-06, + "loss": 0.8032, + "step": 7930 + }, + { + "epoch": 0.4365127414827453, + "grad_norm": 0.679499089717865, + "learning_rate": 8.891333996092041e-06, + "loss": 0.7366, + "step": 7931 + }, + { + "epoch": 0.436567780285101, + "grad_norm": 0.7774114012718201, + "learning_rate": 8.891061793152718e-06, + "loss": 0.7917, + "step": 7932 + }, + { + "epoch": 0.43662281908745665, + "grad_norm": 0.6951174139976501, + "learning_rate": 8.890789560969216e-06, + "loss": 0.7518, + "step": 7933 + }, + { + "epoch": 0.43667785788981234, + "grad_norm": 0.7645227909088135, + "learning_rate": 8.89051729954358e-06, + "loss": 0.7787, + "step": 7934 + }, + { + "epoch": 0.43673289669216797, + "grad_norm": 0.7127084732055664, + "learning_rate": 8.890245008877857e-06, + "loss": 0.8137, + "step": 7935 + }, + { + "epoch": 0.43678793549452366, + "grad_norm": 0.7541413903236389, + "learning_rate": 8.889972688974095e-06, + "loss": 0.776, + "step": 7936 + }, + { + "epoch": 0.4368429742968793, + "grad_norm": 0.690963625907898, + "learning_rate": 8.889700339834339e-06, + "loss": 0.7691, + "step": 7937 + }, + { + "epoch": 0.436898013099235, + "grad_norm": 0.750221848487854, + "learning_rate": 8.889427961460636e-06, + "loss": 0.7831, + "step": 7938 + }, + { + "epoch": 0.4369530519015906, + "grad_norm": 0.7255545854568481, + "learning_rate": 8.889155553855035e-06, + "loss": 0.7831, + "step": 7939 + }, + { + "epoch": 0.4370080907039463, + "grad_norm": 0.7187026143074036, + "learning_rate": 8.88888311701958e-06, + "loss": 0.792, + "step": 7940 + }, + { + "epoch": 0.43706312950630194, + "grad_norm": 0.8313350081443787, + "learning_rate": 8.888610650956322e-06, + "loss": 0.706, + "step": 7941 + }, + { + "epoch": 0.43711816830865763, + "grad_norm": 0.8083454370498657, + "learning_rate": 8.888338155667307e-06, + "loss": 0.7857, + "step": 7942 + }, + { + "epoch": 0.43717320711101326, + "grad_norm": 0.8200840353965759, + "learning_rate": 8.888065631154583e-06, + "loss": 0.8601, + "step": 7943 + }, + { + "epoch": 0.43722824591336895, + "grad_norm": 0.7503816485404968, + "learning_rate": 8.887793077420198e-06, + "loss": 0.7744, + "step": 7944 + }, + { + "epoch": 0.4372832847157246, + "grad_norm": 0.7466493248939514, + "learning_rate": 8.887520494466202e-06, + "loss": 0.7818, + "step": 7945 + }, + { + "epoch": 0.4373383235180803, + "grad_norm": 0.728118360042572, + "learning_rate": 8.887247882294641e-06, + "loss": 0.7157, + "step": 7946 + }, + { + "epoch": 0.4373933623204359, + "grad_norm": 0.9199670553207397, + "learning_rate": 8.886975240907568e-06, + "loss": 0.8283, + "step": 7947 + }, + { + "epoch": 0.43744840112279154, + "grad_norm": 0.735584557056427, + "learning_rate": 8.886702570307027e-06, + "loss": 0.6588, + "step": 7948 + }, + { + "epoch": 0.43750343992514723, + "grad_norm": 0.8619036674499512, + "learning_rate": 8.886429870495072e-06, + "loss": 0.7269, + "step": 7949 + }, + { + "epoch": 0.43755847872750286, + "grad_norm": 0.7304830551147461, + "learning_rate": 8.886157141473747e-06, + "loss": 0.6725, + "step": 7950 + }, + { + "epoch": 0.43761351752985855, + "grad_norm": 0.7669086456298828, + "learning_rate": 8.885884383245109e-06, + "loss": 0.6957, + "step": 7951 + }, + { + "epoch": 0.4376685563322142, + "grad_norm": 0.7558299899101257, + "learning_rate": 8.885611595811203e-06, + "loss": 0.8159, + "step": 7952 + }, + { + "epoch": 0.4377235951345699, + "grad_norm": 0.7661786079406738, + "learning_rate": 8.88533877917408e-06, + "loss": 0.764, + "step": 7953 + }, + { + "epoch": 0.4377786339369255, + "grad_norm": 0.7461101412773132, + "learning_rate": 8.88506593333579e-06, + "loss": 0.7544, + "step": 7954 + }, + { + "epoch": 0.4378336727392812, + "grad_norm": 0.7989180088043213, + "learning_rate": 8.884793058298387e-06, + "loss": 0.6913, + "step": 7955 + }, + { + "epoch": 0.43788871154163683, + "grad_norm": 0.7964022755622864, + "learning_rate": 8.884520154063917e-06, + "loss": 0.7339, + "step": 7956 + }, + { + "epoch": 0.4379437503439925, + "grad_norm": 0.7278034687042236, + "learning_rate": 8.884247220634433e-06, + "loss": 0.8477, + "step": 7957 + }, + { + "epoch": 0.43799878914634816, + "grad_norm": 0.7294753789901733, + "learning_rate": 8.883974258011988e-06, + "loss": 0.8412, + "step": 7958 + }, + { + "epoch": 0.43805382794870384, + "grad_norm": 0.665734589099884, + "learning_rate": 8.88370126619863e-06, + "loss": 0.7838, + "step": 7959 + }, + { + "epoch": 0.4381088667510595, + "grad_norm": 0.6984216570854187, + "learning_rate": 8.883428245196414e-06, + "loss": 0.7657, + "step": 7960 + }, + { + "epoch": 0.43816390555341517, + "grad_norm": 0.8048402070999146, + "learning_rate": 8.883155195007393e-06, + "loss": 0.7553, + "step": 7961 + }, + { + "epoch": 0.4382189443557708, + "grad_norm": 0.7145794630050659, + "learning_rate": 8.882882115633616e-06, + "loss": 0.6583, + "step": 7962 + }, + { + "epoch": 0.4382739831581265, + "grad_norm": 0.7073546648025513, + "learning_rate": 8.882609007077135e-06, + "loss": 0.7869, + "step": 7963 + }, + { + "epoch": 0.4383290219604821, + "grad_norm": 0.8300859928131104, + "learning_rate": 8.882335869340004e-06, + "loss": 0.773, + "step": 7964 + }, + { + "epoch": 0.4383840607628378, + "grad_norm": 0.8343188762664795, + "learning_rate": 8.882062702424276e-06, + "loss": 0.6743, + "step": 7965 + }, + { + "epoch": 0.43843909956519345, + "grad_norm": 0.7106530666351318, + "learning_rate": 8.881789506332007e-06, + "loss": 0.7414, + "step": 7966 + }, + { + "epoch": 0.43849413836754914, + "grad_norm": 0.7015630602836609, + "learning_rate": 8.881516281065244e-06, + "loss": 0.7434, + "step": 7967 + }, + { + "epoch": 0.43854917716990477, + "grad_norm": 0.8106673955917358, + "learning_rate": 8.881243026626044e-06, + "loss": 0.7741, + "step": 7968 + }, + { + "epoch": 0.43860421597226046, + "grad_norm": 0.8181495070457458, + "learning_rate": 8.88096974301646e-06, + "loss": 0.8046, + "step": 7969 + }, + { + "epoch": 0.4386592547746161, + "grad_norm": 0.7767857313156128, + "learning_rate": 8.880696430238546e-06, + "loss": 0.8586, + "step": 7970 + }, + { + "epoch": 0.4387142935769718, + "grad_norm": 0.7257522940635681, + "learning_rate": 8.880423088294359e-06, + "loss": 0.7799, + "step": 7971 + }, + { + "epoch": 0.4387693323793274, + "grad_norm": 0.6896021366119385, + "learning_rate": 8.880149717185948e-06, + "loss": 0.8178, + "step": 7972 + }, + { + "epoch": 0.4388243711816831, + "grad_norm": 0.7646406292915344, + "learning_rate": 8.879876316915372e-06, + "loss": 0.8754, + "step": 7973 + }, + { + "epoch": 0.43887940998403874, + "grad_norm": 0.8043848872184753, + "learning_rate": 8.879602887484684e-06, + "loss": 0.8562, + "step": 7974 + }, + { + "epoch": 0.4389344487863944, + "grad_norm": 0.6727305054664612, + "learning_rate": 8.879329428895937e-06, + "loss": 0.6168, + "step": 7975 + }, + { + "epoch": 0.43898948758875006, + "grad_norm": 0.7634731531143188, + "learning_rate": 8.87905594115119e-06, + "loss": 0.857, + "step": 7976 + }, + { + "epoch": 0.43904452639110575, + "grad_norm": 0.6544492244720459, + "learning_rate": 8.878782424252497e-06, + "loss": 0.6302, + "step": 7977 + }, + { + "epoch": 0.4390995651934614, + "grad_norm": 0.8126636743545532, + "learning_rate": 8.878508878201915e-06, + "loss": 0.7823, + "step": 7978 + }, + { + "epoch": 0.43915460399581707, + "grad_norm": 0.7235779166221619, + "learning_rate": 8.878235303001497e-06, + "loss": 0.7527, + "step": 7979 + }, + { + "epoch": 0.4392096427981727, + "grad_norm": 0.6961055397987366, + "learning_rate": 8.8779616986533e-06, + "loss": 0.7383, + "step": 7980 + }, + { + "epoch": 0.4392646816005284, + "grad_norm": 0.7684490084648132, + "learning_rate": 8.877688065159382e-06, + "loss": 0.8009, + "step": 7981 + }, + { + "epoch": 0.43931972040288403, + "grad_norm": 0.7897803783416748, + "learning_rate": 8.877414402521797e-06, + "loss": 0.7561, + "step": 7982 + }, + { + "epoch": 0.4393747592052397, + "grad_norm": 0.7877688407897949, + "learning_rate": 8.877140710742606e-06, + "loss": 0.7949, + "step": 7983 + }, + { + "epoch": 0.43942979800759535, + "grad_norm": 0.8341611623764038, + "learning_rate": 8.876866989823862e-06, + "loss": 0.7585, + "step": 7984 + }, + { + "epoch": 0.43948483680995104, + "grad_norm": 0.7663636207580566, + "learning_rate": 8.876593239767622e-06, + "loss": 0.771, + "step": 7985 + }, + { + "epoch": 0.4395398756123067, + "grad_norm": 0.6824129223823547, + "learning_rate": 8.876319460575946e-06, + "loss": 0.7852, + "step": 7986 + }, + { + "epoch": 0.43959491441466236, + "grad_norm": 0.6533854007720947, + "learning_rate": 8.876045652250891e-06, + "loss": 0.723, + "step": 7987 + }, + { + "epoch": 0.439649953217018, + "grad_norm": 0.7174259424209595, + "learning_rate": 8.875771814794515e-06, + "loss": 0.749, + "step": 7988 + }, + { + "epoch": 0.4397049920193737, + "grad_norm": 0.8585928678512573, + "learning_rate": 8.875497948208875e-06, + "loss": 0.6727, + "step": 7989 + }, + { + "epoch": 0.4397600308217293, + "grad_norm": 0.7558062672615051, + "learning_rate": 8.875224052496029e-06, + "loss": 0.7929, + "step": 7990 + }, + { + "epoch": 0.43981506962408495, + "grad_norm": 0.7063853144645691, + "learning_rate": 8.874950127658037e-06, + "loss": 0.7397, + "step": 7991 + }, + { + "epoch": 0.43987010842644064, + "grad_norm": 0.7165526747703552, + "learning_rate": 8.874676173696956e-06, + "loss": 0.7678, + "step": 7992 + }, + { + "epoch": 0.4399251472287963, + "grad_norm": 0.7657830715179443, + "learning_rate": 8.874402190614847e-06, + "loss": 0.8318, + "step": 7993 + }, + { + "epoch": 0.43998018603115197, + "grad_norm": 0.7776834964752197, + "learning_rate": 8.874128178413769e-06, + "loss": 0.8589, + "step": 7994 + }, + { + "epoch": 0.4400352248335076, + "grad_norm": 0.6805633306503296, + "learning_rate": 8.873854137095778e-06, + "loss": 0.7009, + "step": 7995 + }, + { + "epoch": 0.4400902636358633, + "grad_norm": 0.6962490677833557, + "learning_rate": 8.87358006666294e-06, + "loss": 0.7896, + "step": 7996 + }, + { + "epoch": 0.4401453024382189, + "grad_norm": 0.611610472202301, + "learning_rate": 8.873305967117307e-06, + "loss": 0.5993, + "step": 7997 + }, + { + "epoch": 0.4402003412405746, + "grad_norm": 0.7442964911460876, + "learning_rate": 8.873031838460946e-06, + "loss": 0.8277, + "step": 7998 + }, + { + "epoch": 0.44025538004293024, + "grad_norm": 0.6858734488487244, + "learning_rate": 8.872757680695914e-06, + "loss": 0.8064, + "step": 7999 + }, + { + "epoch": 0.44031041884528593, + "grad_norm": 0.6654849052429199, + "learning_rate": 8.872483493824273e-06, + "loss": 0.7408, + "step": 8000 + }, + { + "epoch": 0.44036545764764157, + "grad_norm": 0.8241575956344604, + "learning_rate": 8.87220927784808e-06, + "loss": 0.8819, + "step": 8001 + }, + { + "epoch": 0.44042049644999726, + "grad_norm": 0.7078573107719421, + "learning_rate": 8.8719350327694e-06, + "loss": 0.7709, + "step": 8002 + }, + { + "epoch": 0.4404755352523529, + "grad_norm": 0.7369210720062256, + "learning_rate": 8.871660758590292e-06, + "loss": 0.7867, + "step": 8003 + }, + { + "epoch": 0.4405305740547086, + "grad_norm": 0.7206673622131348, + "learning_rate": 8.87138645531282e-06, + "loss": 0.8697, + "step": 8004 + }, + { + "epoch": 0.4405856128570642, + "grad_norm": 0.8370183706283569, + "learning_rate": 8.871112122939041e-06, + "loss": 0.7201, + "step": 8005 + }, + { + "epoch": 0.4406406516594199, + "grad_norm": 0.8015196323394775, + "learning_rate": 8.870837761471023e-06, + "loss": 0.774, + "step": 8006 + }, + { + "epoch": 0.44069569046177554, + "grad_norm": 0.730185329914093, + "learning_rate": 8.870563370910821e-06, + "loss": 0.7153, + "step": 8007 + }, + { + "epoch": 0.4407507292641312, + "grad_norm": 0.6719930768013, + "learning_rate": 8.870288951260503e-06, + "loss": 0.7949, + "step": 8008 + }, + { + "epoch": 0.44080576806648686, + "grad_norm": 0.7614291906356812, + "learning_rate": 8.870014502522128e-06, + "loss": 0.7143, + "step": 8009 + }, + { + "epoch": 0.44086080686884255, + "grad_norm": 0.7438056468963623, + "learning_rate": 8.86974002469776e-06, + "loss": 0.6859, + "step": 8010 + }, + { + "epoch": 0.4409158456711982, + "grad_norm": 0.759903073310852, + "learning_rate": 8.869465517789463e-06, + "loss": 0.8095, + "step": 8011 + }, + { + "epoch": 0.44097088447355387, + "grad_norm": 0.7622823119163513, + "learning_rate": 8.869190981799298e-06, + "loss": 0.786, + "step": 8012 + }, + { + "epoch": 0.4410259232759095, + "grad_norm": 0.677003800868988, + "learning_rate": 8.86891641672933e-06, + "loss": 0.7074, + "step": 8013 + }, + { + "epoch": 0.4410809620782652, + "grad_norm": 0.9258451461791992, + "learning_rate": 8.86864182258162e-06, + "loss": 0.7218, + "step": 8014 + }, + { + "epoch": 0.4411360008806208, + "grad_norm": 0.7027828693389893, + "learning_rate": 8.868367199358236e-06, + "loss": 0.7654, + "step": 8015 + }, + { + "epoch": 0.4411910396829765, + "grad_norm": 0.8279967308044434, + "learning_rate": 8.868092547061239e-06, + "loss": 0.8969, + "step": 8016 + }, + { + "epoch": 0.44124607848533215, + "grad_norm": 0.7366079688072205, + "learning_rate": 8.867817865692693e-06, + "loss": 0.8421, + "step": 8017 + }, + { + "epoch": 0.44130111728768784, + "grad_norm": 0.7548787593841553, + "learning_rate": 8.867543155254665e-06, + "loss": 0.79, + "step": 8018 + }, + { + "epoch": 0.44135615609004347, + "grad_norm": 0.7558487057685852, + "learning_rate": 8.867268415749215e-06, + "loss": 0.8461, + "step": 8019 + }, + { + "epoch": 0.44141119489239916, + "grad_norm": 0.6413403153419495, + "learning_rate": 8.866993647178413e-06, + "loss": 0.6811, + "step": 8020 + }, + { + "epoch": 0.4414662336947548, + "grad_norm": 0.9251089692115784, + "learning_rate": 8.86671884954432e-06, + "loss": 0.868, + "step": 8021 + }, + { + "epoch": 0.4415212724971105, + "grad_norm": 0.7920099496841431, + "learning_rate": 8.866444022849006e-06, + "loss": 0.8131, + "step": 8022 + }, + { + "epoch": 0.4415763112994661, + "grad_norm": 0.8738380670547485, + "learning_rate": 8.866169167094532e-06, + "loss": 0.857, + "step": 8023 + }, + { + "epoch": 0.4416313501018218, + "grad_norm": 0.7181336283683777, + "learning_rate": 8.865894282282965e-06, + "loss": 0.7869, + "step": 8024 + }, + { + "epoch": 0.44168638890417744, + "grad_norm": 0.8003776669502258, + "learning_rate": 8.865619368416373e-06, + "loss": 0.8874, + "step": 8025 + }, + { + "epoch": 0.44174142770653313, + "grad_norm": 0.7186623215675354, + "learning_rate": 8.86534442549682e-06, + "loss": 0.7931, + "step": 8026 + }, + { + "epoch": 0.44179646650888876, + "grad_norm": 0.7006831765174866, + "learning_rate": 8.865069453526371e-06, + "loss": 0.7046, + "step": 8027 + }, + { + "epoch": 0.44185150531124445, + "grad_norm": 0.7394786477088928, + "learning_rate": 8.864794452507097e-06, + "loss": 0.685, + "step": 8028 + }, + { + "epoch": 0.4419065441136001, + "grad_norm": 0.7512097358703613, + "learning_rate": 8.864519422441062e-06, + "loss": 0.8047, + "step": 8029 + }, + { + "epoch": 0.4419615829159558, + "grad_norm": 0.6866902709007263, + "learning_rate": 8.864244363330333e-06, + "loss": 0.7099, + "step": 8030 + }, + { + "epoch": 0.4420166217183114, + "grad_norm": 0.7316723465919495, + "learning_rate": 8.863969275176978e-06, + "loss": 0.7767, + "step": 8031 + }, + { + "epoch": 0.4420716605206671, + "grad_norm": 0.7103593349456787, + "learning_rate": 8.863694157983064e-06, + "loss": 0.7832, + "step": 8032 + }, + { + "epoch": 0.44212669932302273, + "grad_norm": 0.6922749876976013, + "learning_rate": 8.863419011750659e-06, + "loss": 0.7833, + "step": 8033 + }, + { + "epoch": 0.44218173812537837, + "grad_norm": 0.7989425659179688, + "learning_rate": 8.863143836481831e-06, + "loss": 0.8651, + "step": 8034 + }, + { + "epoch": 0.44223677692773405, + "grad_norm": 0.6765440702438354, + "learning_rate": 8.862868632178648e-06, + "loss": 0.7858, + "step": 8035 + }, + { + "epoch": 0.4422918157300897, + "grad_norm": 0.670767068862915, + "learning_rate": 8.862593398843178e-06, + "loss": 0.6789, + "step": 8036 + }, + { + "epoch": 0.4423468545324454, + "grad_norm": 0.7556853294372559, + "learning_rate": 8.86231813647749e-06, + "loss": 0.8036, + "step": 8037 + }, + { + "epoch": 0.442401893334801, + "grad_norm": 0.788690984249115, + "learning_rate": 8.862042845083654e-06, + "loss": 0.8355, + "step": 8038 + }, + { + "epoch": 0.4424569321371567, + "grad_norm": 0.8439056873321533, + "learning_rate": 8.861767524663736e-06, + "loss": 0.7327, + "step": 8039 + }, + { + "epoch": 0.44251197093951233, + "grad_norm": 0.7101821899414062, + "learning_rate": 8.861492175219808e-06, + "loss": 0.8303, + "step": 8040 + }, + { + "epoch": 0.442567009741868, + "grad_norm": 0.741680383682251, + "learning_rate": 8.861216796753937e-06, + "loss": 0.7377, + "step": 8041 + }, + { + "epoch": 0.44262204854422366, + "grad_norm": 0.7588099837303162, + "learning_rate": 8.860941389268196e-06, + "loss": 0.8217, + "step": 8042 + }, + { + "epoch": 0.44267708734657935, + "grad_norm": 0.7654829025268555, + "learning_rate": 8.860665952764654e-06, + "loss": 0.8416, + "step": 8043 + }, + { + "epoch": 0.442732126148935, + "grad_norm": 0.7025987505912781, + "learning_rate": 8.860390487245378e-06, + "loss": 0.7312, + "step": 8044 + }, + { + "epoch": 0.44278716495129067, + "grad_norm": 0.7206251621246338, + "learning_rate": 8.860114992712441e-06, + "loss": 0.7522, + "step": 8045 + }, + { + "epoch": 0.4428422037536463, + "grad_norm": 0.7041749954223633, + "learning_rate": 8.859839469167912e-06, + "loss": 0.746, + "step": 8046 + }, + { + "epoch": 0.442897242556002, + "grad_norm": 0.6941862106323242, + "learning_rate": 8.859563916613864e-06, + "loss": 0.7692, + "step": 8047 + }, + { + "epoch": 0.4429522813583576, + "grad_norm": 0.6897740364074707, + "learning_rate": 8.859288335052367e-06, + "loss": 0.7963, + "step": 8048 + }, + { + "epoch": 0.4430073201607133, + "grad_norm": 0.6744545698165894, + "learning_rate": 8.859012724485492e-06, + "loss": 0.7647, + "step": 8049 + }, + { + "epoch": 0.44306235896306895, + "grad_norm": 0.7899364829063416, + "learning_rate": 8.858737084915309e-06, + "loss": 0.8373, + "step": 8050 + }, + { + "epoch": 0.44311739776542464, + "grad_norm": 0.806016743183136, + "learning_rate": 8.85846141634389e-06, + "loss": 0.7871, + "step": 8051 + }, + { + "epoch": 0.44317243656778027, + "grad_norm": 0.7444993257522583, + "learning_rate": 8.85818571877331e-06, + "loss": 0.8099, + "step": 8052 + }, + { + "epoch": 0.44322747537013596, + "grad_norm": 0.772735059261322, + "learning_rate": 8.85790999220564e-06, + "loss": 0.7113, + "step": 8053 + }, + { + "epoch": 0.4432825141724916, + "grad_norm": 0.7743984460830688, + "learning_rate": 8.85763423664295e-06, + "loss": 0.8935, + "step": 8054 + }, + { + "epoch": 0.4433375529748473, + "grad_norm": 0.6751214265823364, + "learning_rate": 8.857358452087313e-06, + "loss": 0.6769, + "step": 8055 + }, + { + "epoch": 0.4433925917772029, + "grad_norm": 0.6921005845069885, + "learning_rate": 8.857082638540803e-06, + "loss": 0.7071, + "step": 8056 + }, + { + "epoch": 0.4434476305795586, + "grad_norm": 0.7884092330932617, + "learning_rate": 8.856806796005491e-06, + "loss": 0.7919, + "step": 8057 + }, + { + "epoch": 0.44350266938191424, + "grad_norm": 0.6522679924964905, + "learning_rate": 8.856530924483452e-06, + "loss": 0.7449, + "step": 8058 + }, + { + "epoch": 0.4435577081842699, + "grad_norm": 0.7172590494155884, + "learning_rate": 8.85625502397676e-06, + "loss": 0.7306, + "step": 8059 + }, + { + "epoch": 0.44361274698662556, + "grad_norm": 0.698658287525177, + "learning_rate": 8.855979094487488e-06, + "loss": 0.803, + "step": 8060 + }, + { + "epoch": 0.44366778578898125, + "grad_norm": 0.685589075088501, + "learning_rate": 8.855703136017708e-06, + "loss": 0.763, + "step": 8061 + }, + { + "epoch": 0.4437228245913369, + "grad_norm": 0.8259774446487427, + "learning_rate": 8.855427148569495e-06, + "loss": 0.811, + "step": 8062 + }, + { + "epoch": 0.4437778633936926, + "grad_norm": 0.6976660490036011, + "learning_rate": 8.855151132144926e-06, + "loss": 0.7345, + "step": 8063 + }, + { + "epoch": 0.4438329021960482, + "grad_norm": 0.7696738243103027, + "learning_rate": 8.854875086746071e-06, + "loss": 0.823, + "step": 8064 + }, + { + "epoch": 0.4438879409984039, + "grad_norm": 0.6627930998802185, + "learning_rate": 8.854599012375006e-06, + "loss": 0.7455, + "step": 8065 + }, + { + "epoch": 0.44394297980075953, + "grad_norm": 0.7492700815200806, + "learning_rate": 8.854322909033809e-06, + "loss": 0.8195, + "step": 8066 + }, + { + "epoch": 0.4439980186031152, + "grad_norm": 0.8335888981819153, + "learning_rate": 8.85404677672455e-06, + "loss": 0.7683, + "step": 8067 + }, + { + "epoch": 0.44405305740547085, + "grad_norm": 0.7448242902755737, + "learning_rate": 8.853770615449309e-06, + "loss": 0.8352, + "step": 8068 + }, + { + "epoch": 0.44410809620782654, + "grad_norm": 0.700616180896759, + "learning_rate": 8.853494425210158e-06, + "loss": 0.7892, + "step": 8069 + }, + { + "epoch": 0.4441631350101822, + "grad_norm": 0.6959284543991089, + "learning_rate": 8.853218206009176e-06, + "loss": 0.6944, + "step": 8070 + }, + { + "epoch": 0.44421817381253786, + "grad_norm": 0.7507375478744507, + "learning_rate": 8.852941957848438e-06, + "loss": 0.8921, + "step": 8071 + }, + { + "epoch": 0.4442732126148935, + "grad_norm": 0.7843918204307556, + "learning_rate": 8.852665680730019e-06, + "loss": 0.816, + "step": 8072 + }, + { + "epoch": 0.4443282514172492, + "grad_norm": 0.8702702522277832, + "learning_rate": 8.852389374655995e-06, + "loss": 0.8191, + "step": 8073 + }, + { + "epoch": 0.4443832902196048, + "grad_norm": 0.6784317493438721, + "learning_rate": 8.852113039628445e-06, + "loss": 0.7726, + "step": 8074 + }, + { + "epoch": 0.4444383290219605, + "grad_norm": 0.724530041217804, + "learning_rate": 8.851836675649443e-06, + "loss": 0.8214, + "step": 8075 + }, + { + "epoch": 0.44449336782431614, + "grad_norm": 0.9814287424087524, + "learning_rate": 8.851560282721067e-06, + "loss": 0.8368, + "step": 8076 + }, + { + "epoch": 0.4445484066266718, + "grad_norm": 0.6606815457344055, + "learning_rate": 8.851283860845398e-06, + "loss": 0.7772, + "step": 8077 + }, + { + "epoch": 0.44460344542902747, + "grad_norm": 0.6910951137542725, + "learning_rate": 8.851007410024507e-06, + "loss": 0.7007, + "step": 8078 + }, + { + "epoch": 0.4446584842313831, + "grad_norm": 0.6764300465583801, + "learning_rate": 8.850730930260479e-06, + "loss": 0.7265, + "step": 8079 + }, + { + "epoch": 0.4447135230337388, + "grad_norm": 0.669622004032135, + "learning_rate": 8.850454421555386e-06, + "loss": 0.7551, + "step": 8080 + }, + { + "epoch": 0.4447685618360944, + "grad_norm": 0.7068240642547607, + "learning_rate": 8.850177883911307e-06, + "loss": 0.8358, + "step": 8081 + }, + { + "epoch": 0.4448236006384501, + "grad_norm": 0.7100360989570618, + "learning_rate": 8.849901317330324e-06, + "loss": 0.7074, + "step": 8082 + }, + { + "epoch": 0.44487863944080575, + "grad_norm": 0.7510328888893127, + "learning_rate": 8.849624721814511e-06, + "loss": 0.6654, + "step": 8083 + }, + { + "epoch": 0.44493367824316143, + "grad_norm": 0.8106432557106018, + "learning_rate": 8.849348097365951e-06, + "loss": 0.6944, + "step": 8084 + }, + { + "epoch": 0.44498871704551707, + "grad_norm": 0.6852346062660217, + "learning_rate": 8.84907144398672e-06, + "loss": 0.7203, + "step": 8085 + }, + { + "epoch": 0.44504375584787276, + "grad_norm": 0.8495593667030334, + "learning_rate": 8.848794761678898e-06, + "loss": 0.7918, + "step": 8086 + }, + { + "epoch": 0.4450987946502284, + "grad_norm": 0.7110981941223145, + "learning_rate": 8.848518050444565e-06, + "loss": 0.8176, + "step": 8087 + }, + { + "epoch": 0.4451538334525841, + "grad_norm": 0.7740922570228577, + "learning_rate": 8.8482413102858e-06, + "loss": 0.7573, + "step": 8088 + }, + { + "epoch": 0.4452088722549397, + "grad_norm": 0.9645134806632996, + "learning_rate": 8.847964541204685e-06, + "loss": 0.7842, + "step": 8089 + }, + { + "epoch": 0.4452639110572954, + "grad_norm": 0.767621636390686, + "learning_rate": 8.847687743203299e-06, + "loss": 0.8182, + "step": 8090 + }, + { + "epoch": 0.44531894985965104, + "grad_norm": 0.6842975616455078, + "learning_rate": 8.84741091628372e-06, + "loss": 0.7795, + "step": 8091 + }, + { + "epoch": 0.4453739886620067, + "grad_norm": 0.768644392490387, + "learning_rate": 8.847134060448032e-06, + "loss": 0.7363, + "step": 8092 + }, + { + "epoch": 0.44542902746436236, + "grad_norm": 0.6813824772834778, + "learning_rate": 8.846857175698314e-06, + "loss": 0.7601, + "step": 8093 + }, + { + "epoch": 0.44548406626671805, + "grad_norm": 0.8608306646347046, + "learning_rate": 8.846580262036645e-06, + "loss": 0.8205, + "step": 8094 + }, + { + "epoch": 0.4455391050690737, + "grad_norm": 0.6917694807052612, + "learning_rate": 8.84630331946511e-06, + "loss": 0.7207, + "step": 8095 + }, + { + "epoch": 0.44559414387142937, + "grad_norm": 0.6777203679084778, + "learning_rate": 8.84602634798579e-06, + "loss": 0.6939, + "step": 8096 + }, + { + "epoch": 0.445649182673785, + "grad_norm": 0.7249894142150879, + "learning_rate": 8.845749347600764e-06, + "loss": 0.7918, + "step": 8097 + }, + { + "epoch": 0.4457042214761407, + "grad_norm": 0.7446995973587036, + "learning_rate": 8.845472318312116e-06, + "loss": 0.7379, + "step": 8098 + }, + { + "epoch": 0.4457592602784963, + "grad_norm": 0.8245479464530945, + "learning_rate": 8.845195260121927e-06, + "loss": 0.8532, + "step": 8099 + }, + { + "epoch": 0.445814299080852, + "grad_norm": 0.7160329818725586, + "learning_rate": 8.84491817303228e-06, + "loss": 0.7042, + "step": 8100 + }, + { + "epoch": 0.44586933788320765, + "grad_norm": 0.8056026101112366, + "learning_rate": 8.844641057045257e-06, + "loss": 0.8581, + "step": 8101 + }, + { + "epoch": 0.44592437668556334, + "grad_norm": 0.7257886528968811, + "learning_rate": 8.84436391216294e-06, + "loss": 0.7297, + "step": 8102 + }, + { + "epoch": 0.445979415487919, + "grad_norm": 0.7400404810905457, + "learning_rate": 8.844086738387415e-06, + "loss": 0.7703, + "step": 8103 + }, + { + "epoch": 0.44603445429027466, + "grad_norm": 0.665271520614624, + "learning_rate": 8.843809535720763e-06, + "loss": 0.7769, + "step": 8104 + }, + { + "epoch": 0.4460894930926303, + "grad_norm": 0.7041043639183044, + "learning_rate": 8.843532304165066e-06, + "loss": 0.7995, + "step": 8105 + }, + { + "epoch": 0.446144531894986, + "grad_norm": 0.8517841100692749, + "learning_rate": 8.84325504372241e-06, + "loss": 0.8239, + "step": 8106 + }, + { + "epoch": 0.4461995706973416, + "grad_norm": 0.7045741677284241, + "learning_rate": 8.842977754394877e-06, + "loss": 0.7982, + "step": 8107 + }, + { + "epoch": 0.4462546094996973, + "grad_norm": 0.7056185007095337, + "learning_rate": 8.842700436184552e-06, + "loss": 0.8003, + "step": 8108 + }, + { + "epoch": 0.44630964830205294, + "grad_norm": 0.9042232632637024, + "learning_rate": 8.842423089093519e-06, + "loss": 0.7534, + "step": 8109 + }, + { + "epoch": 0.44636468710440863, + "grad_norm": 0.8584854602813721, + "learning_rate": 8.842145713123863e-06, + "loss": 0.7759, + "step": 8110 + }, + { + "epoch": 0.44641972590676426, + "grad_norm": 0.7333530187606812, + "learning_rate": 8.841868308277668e-06, + "loss": 0.7218, + "step": 8111 + }, + { + "epoch": 0.44647476470911995, + "grad_norm": 0.7866941094398499, + "learning_rate": 8.84159087455702e-06, + "loss": 0.7016, + "step": 8112 + }, + { + "epoch": 0.4465298035114756, + "grad_norm": 0.7785252928733826, + "learning_rate": 8.841313411964001e-06, + "loss": 0.8232, + "step": 8113 + }, + { + "epoch": 0.4465848423138313, + "grad_norm": 0.7060698866844177, + "learning_rate": 8.841035920500702e-06, + "loss": 0.6987, + "step": 8114 + }, + { + "epoch": 0.4466398811161869, + "grad_norm": 0.7211717963218689, + "learning_rate": 8.840758400169203e-06, + "loss": 0.8604, + "step": 8115 + }, + { + "epoch": 0.4466949199185426, + "grad_norm": 0.979678213596344, + "learning_rate": 8.840480850971593e-06, + "loss": 0.9028, + "step": 8116 + }, + { + "epoch": 0.44674995872089823, + "grad_norm": 0.6595104336738586, + "learning_rate": 8.840203272909957e-06, + "loss": 0.6899, + "step": 8117 + }, + { + "epoch": 0.4468049975232539, + "grad_norm": 0.6392405033111572, + "learning_rate": 8.83992566598638e-06, + "loss": 0.7729, + "step": 8118 + }, + { + "epoch": 0.44686003632560956, + "grad_norm": 1.1084040403366089, + "learning_rate": 8.839648030202949e-06, + "loss": 0.822, + "step": 8119 + }, + { + "epoch": 0.4469150751279652, + "grad_norm": 0.7024106383323669, + "learning_rate": 8.839370365561754e-06, + "loss": 0.7615, + "step": 8120 + }, + { + "epoch": 0.4469701139303209, + "grad_norm": 0.7204060554504395, + "learning_rate": 8.839092672064878e-06, + "loss": 0.7527, + "step": 8121 + }, + { + "epoch": 0.4470251527326765, + "grad_norm": 0.7307723760604858, + "learning_rate": 8.838814949714407e-06, + "loss": 0.8139, + "step": 8122 + }, + { + "epoch": 0.4470801915350322, + "grad_norm": 0.824034571647644, + "learning_rate": 8.838537198512434e-06, + "loss": 0.8299, + "step": 8123 + }, + { + "epoch": 0.44713523033738783, + "grad_norm": 0.6603747606277466, + "learning_rate": 8.83825941846104e-06, + "loss": 0.6762, + "step": 8124 + }, + { + "epoch": 0.4471902691397435, + "grad_norm": 0.7403088808059692, + "learning_rate": 8.837981609562316e-06, + "loss": 0.716, + "step": 8125 + }, + { + "epoch": 0.44724530794209916, + "grad_norm": 0.742173969745636, + "learning_rate": 8.837703771818351e-06, + "loss": 0.7672, + "step": 8126 + }, + { + "epoch": 0.44730034674445485, + "grad_norm": 0.7158839106559753, + "learning_rate": 8.837425905231232e-06, + "loss": 0.6941, + "step": 8127 + }, + { + "epoch": 0.4473553855468105, + "grad_norm": 0.7659464478492737, + "learning_rate": 8.837148009803044e-06, + "loss": 0.7293, + "step": 8128 + }, + { + "epoch": 0.44741042434916617, + "grad_norm": 0.8681113719940186, + "learning_rate": 8.836870085535882e-06, + "loss": 0.8647, + "step": 8129 + }, + { + "epoch": 0.4474654631515218, + "grad_norm": 0.7117272615432739, + "learning_rate": 8.83659213243183e-06, + "loss": 0.8035, + "step": 8130 + }, + { + "epoch": 0.4475205019538775, + "grad_norm": 0.8220957517623901, + "learning_rate": 8.836314150492978e-06, + "loss": 0.6978, + "step": 8131 + }, + { + "epoch": 0.4475755407562331, + "grad_norm": 0.7045003175735474, + "learning_rate": 8.836036139721418e-06, + "loss": 0.747, + "step": 8132 + }, + { + "epoch": 0.4476305795585888, + "grad_norm": 0.6833191514015198, + "learning_rate": 8.835758100119235e-06, + "loss": 0.7604, + "step": 8133 + }, + { + "epoch": 0.44768561836094445, + "grad_norm": 0.7305697798728943, + "learning_rate": 8.835480031688521e-06, + "loss": 0.7301, + "step": 8134 + }, + { + "epoch": 0.44774065716330014, + "grad_norm": 0.7266964912414551, + "learning_rate": 8.835201934431366e-06, + "loss": 0.7675, + "step": 8135 + }, + { + "epoch": 0.44779569596565577, + "grad_norm": 0.6822015047073364, + "learning_rate": 8.834923808349861e-06, + "loss": 0.8226, + "step": 8136 + }, + { + "epoch": 0.44785073476801146, + "grad_norm": 0.7443515062332153, + "learning_rate": 8.834645653446095e-06, + "loss": 0.9289, + "step": 8137 + }, + { + "epoch": 0.4479057735703671, + "grad_norm": 0.7337210178375244, + "learning_rate": 8.834367469722158e-06, + "loss": 0.7758, + "step": 8138 + }, + { + "epoch": 0.4479608123727228, + "grad_norm": 0.6794925332069397, + "learning_rate": 8.83408925718014e-06, + "loss": 0.8426, + "step": 8139 + }, + { + "epoch": 0.4480158511750784, + "grad_norm": 0.7808265089988708, + "learning_rate": 8.833811015822135e-06, + "loss": 0.8464, + "step": 8140 + }, + { + "epoch": 0.4480708899774341, + "grad_norm": 0.7837018370628357, + "learning_rate": 8.833532745650234e-06, + "loss": 0.8722, + "step": 8141 + }, + { + "epoch": 0.44812592877978974, + "grad_norm": 0.9218140840530396, + "learning_rate": 8.833254446666526e-06, + "loss": 0.7981, + "step": 8142 + }, + { + "epoch": 0.44818096758214543, + "grad_norm": 0.7980387806892395, + "learning_rate": 8.832976118873103e-06, + "loss": 0.7705, + "step": 8143 + }, + { + "epoch": 0.44823600638450106, + "grad_norm": 0.7354007363319397, + "learning_rate": 8.832697762272057e-06, + "loss": 0.8286, + "step": 8144 + }, + { + "epoch": 0.44829104518685675, + "grad_norm": 0.7006223201751709, + "learning_rate": 8.832419376865482e-06, + "loss": 0.7107, + "step": 8145 + }, + { + "epoch": 0.4483460839892124, + "grad_norm": 0.7838212847709656, + "learning_rate": 8.83214096265547e-06, + "loss": 0.7676, + "step": 8146 + }, + { + "epoch": 0.4484011227915681, + "grad_norm": 0.7768213748931885, + "learning_rate": 8.83186251964411e-06, + "loss": 0.8689, + "step": 8147 + }, + { + "epoch": 0.4484561615939237, + "grad_norm": 0.7451630234718323, + "learning_rate": 8.831584047833497e-06, + "loss": 0.8625, + "step": 8148 + }, + { + "epoch": 0.4485112003962794, + "grad_norm": 0.7573269605636597, + "learning_rate": 8.831305547225725e-06, + "loss": 0.7357, + "step": 8149 + }, + { + "epoch": 0.44856623919863503, + "grad_norm": 0.6884848475456238, + "learning_rate": 8.831027017822886e-06, + "loss": 0.7306, + "step": 8150 + }, + { + "epoch": 0.4486212780009907, + "grad_norm": 0.7715907096862793, + "learning_rate": 8.830748459627073e-06, + "loss": 0.8311, + "step": 8151 + }, + { + "epoch": 0.44867631680334635, + "grad_norm": 0.6919859647750854, + "learning_rate": 8.83046987264038e-06, + "loss": 0.845, + "step": 8152 + }, + { + "epoch": 0.44873135560570204, + "grad_norm": 0.7066411972045898, + "learning_rate": 8.830191256864902e-06, + "loss": 0.7554, + "step": 8153 + }, + { + "epoch": 0.4487863944080577, + "grad_norm": 0.754196047782898, + "learning_rate": 8.829912612302729e-06, + "loss": 0.7396, + "step": 8154 + }, + { + "epoch": 0.44884143321041337, + "grad_norm": 0.7612286806106567, + "learning_rate": 8.82963393895596e-06, + "loss": 0.8154, + "step": 8155 + }, + { + "epoch": 0.448896472012769, + "grad_norm": 0.8576892614364624, + "learning_rate": 8.829355236826688e-06, + "loss": 0.7395, + "step": 8156 + }, + { + "epoch": 0.4489515108151247, + "grad_norm": 0.6813738346099854, + "learning_rate": 8.829076505917005e-06, + "loss": 0.7661, + "step": 8157 + }, + { + "epoch": 0.4490065496174803, + "grad_norm": 0.7453964948654175, + "learning_rate": 8.828797746229009e-06, + "loss": 0.8221, + "step": 8158 + }, + { + "epoch": 0.449061588419836, + "grad_norm": 0.7546728849411011, + "learning_rate": 8.828518957764795e-06, + "loss": 0.7717, + "step": 8159 + }, + { + "epoch": 0.44911662722219164, + "grad_norm": 0.8270652890205383, + "learning_rate": 8.828240140526456e-06, + "loss": 0.7582, + "step": 8160 + }, + { + "epoch": 0.44917166602454733, + "grad_norm": 0.8188696503639221, + "learning_rate": 8.827961294516089e-06, + "loss": 0.8841, + "step": 8161 + }, + { + "epoch": 0.44922670482690297, + "grad_norm": 0.9101365208625793, + "learning_rate": 8.82768241973579e-06, + "loss": 0.7099, + "step": 8162 + }, + { + "epoch": 0.4492817436292586, + "grad_norm": 0.6749762892723083, + "learning_rate": 8.827403516187656e-06, + "loss": 0.7766, + "step": 8163 + }, + { + "epoch": 0.4493367824316143, + "grad_norm": 1.1351534128189087, + "learning_rate": 8.827124583873781e-06, + "loss": 0.7536, + "step": 8164 + }, + { + "epoch": 0.4493918212339699, + "grad_norm": 0.8729487061500549, + "learning_rate": 8.826845622796261e-06, + "loss": 0.8613, + "step": 8165 + }, + { + "epoch": 0.4494468600363256, + "grad_norm": 0.7495871782302856, + "learning_rate": 8.826566632957193e-06, + "loss": 0.8365, + "step": 8166 + }, + { + "epoch": 0.44950189883868125, + "grad_norm": 0.6414516568183899, + "learning_rate": 8.826287614358677e-06, + "loss": 0.6574, + "step": 8167 + }, + { + "epoch": 0.44955693764103694, + "grad_norm": 0.6954017281532288, + "learning_rate": 8.826008567002805e-06, + "loss": 0.7857, + "step": 8168 + }, + { + "epoch": 0.44961197644339257, + "grad_norm": 0.7199459075927734, + "learning_rate": 8.825729490891678e-06, + "loss": 0.8585, + "step": 8169 + }, + { + "epoch": 0.44966701524574826, + "grad_norm": 0.8245406746864319, + "learning_rate": 8.825450386027392e-06, + "loss": 0.7238, + "step": 8170 + }, + { + "epoch": 0.4497220540481039, + "grad_norm": 0.6348667740821838, + "learning_rate": 8.825171252412044e-06, + "loss": 0.6991, + "step": 8171 + }, + { + "epoch": 0.4497770928504596, + "grad_norm": 0.6304741501808167, + "learning_rate": 8.824892090047734e-06, + "loss": 0.7101, + "step": 8172 + }, + { + "epoch": 0.4498321316528152, + "grad_norm": 0.7088820338249207, + "learning_rate": 8.82461289893656e-06, + "loss": 0.8217, + "step": 8173 + }, + { + "epoch": 0.4498871704551709, + "grad_norm": 0.7570851445198059, + "learning_rate": 8.824333679080617e-06, + "loss": 0.8029, + "step": 8174 + }, + { + "epoch": 0.44994220925752654, + "grad_norm": 0.7544378042221069, + "learning_rate": 8.824054430482007e-06, + "loss": 0.777, + "step": 8175 + }, + { + "epoch": 0.4499972480598822, + "grad_norm": 0.8226260542869568, + "learning_rate": 8.823775153142827e-06, + "loss": 0.8391, + "step": 8176 + }, + { + "epoch": 0.45005228686223786, + "grad_norm": 0.6861422061920166, + "learning_rate": 8.823495847065176e-06, + "loss": 0.7491, + "step": 8177 + }, + { + "epoch": 0.45010732566459355, + "grad_norm": 0.6643275618553162, + "learning_rate": 8.823216512251153e-06, + "loss": 0.6773, + "step": 8178 + }, + { + "epoch": 0.4501623644669492, + "grad_norm": 0.8201391100883484, + "learning_rate": 8.82293714870286e-06, + "loss": 0.8065, + "step": 8179 + }, + { + "epoch": 0.45021740326930487, + "grad_norm": 0.7783405780792236, + "learning_rate": 8.822657756422394e-06, + "loss": 0.7884, + "step": 8180 + }, + { + "epoch": 0.4502724420716605, + "grad_norm": 0.720745861530304, + "learning_rate": 8.822378335411856e-06, + "loss": 0.765, + "step": 8181 } ], "logging_steps": 1, @@ -50930,7 +57293,7 @@ "attributes": {} } }, - "total_flos": 2.1460117712975954e+19, + "total_flos": 2.414263242709795e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null